diff --git a/examples/fnv1a/fnv1a.s b/examples/fnv1a/fnv1a.s index 44b48cd..6628c37 100644 --- a/examples/fnv1a/fnv1a.s +++ b/examples/fnv1a/fnv1a.s @@ -3,18 +3,18 @@ #include "textflag.h" // func Hash64(data []byte) uint64 -TEXT ·Hash64(SB), NOSPLIT, $8-32 +TEXT ·Hash64(SB), NOSPLIT, $0-32 MOVQ data_base+0(FP), CX MOVQ data_len+8(FP), BX MOVQ $0xcbf29ce484222325, AX - MOVQ $0x00000100000001b3, BP + MOVQ $0x00000100000001b3, SI loop: CMPQ BX, $0x00 JE done MOVBQZX (CX), DX XORQ DX, AX - MULQ BP + MULQ SI INCQ CX DECQ BX JMP loop diff --git a/examples/sha1/sha1.s b/examples/sha1/sha1.s index 3dbca55..64807f7 100644 --- a/examples/sha1/sha1.s +++ b/examples/sha1/sha1.s @@ -8,272 +8,272 @@ TEXT ·block(SB), $64-32 // Load initial hash. MOVL (AX), DX MOVL 4(AX), BX - MOVL 8(AX), BP - MOVL 12(AX), SI - MOVL 16(AX), DI + MOVL 8(AX), SI + MOVL 12(AX), DI + MOVL 16(AX), R8 // Initialize registers. - MOVL DX, R8 - MOVL BX, R9 - MOVL BP, R10 + MOVL DX, R9 + MOVL BX, R10 MOVL SI, R11 MOVL DI, R12 + MOVL R8, R13 // Round 0. - MOVL (CX), R13 + MOVL (CX), R14 + BSWAPL R14 + MOVL R14, (SP) + MOVL R9, R15 + ROLL $0x05, R15 + MOVL R12, BP + XORL R11, BP + ANDL R10, BP + XORL R12, BP + ADDL BP, R15 + ADDL R13, R15 + ADDL $0x5a827999, R15 + ADDL R14, R15 + ROLL $0x1e, R10 + + // Round 1. + MOVL 4(CX), R13 BSWAPL R13 - MOVL R13, (SP) - MOVL R8, R14 + MOVL R13, 4(SP) + MOVL R15, R14 ROLL $0x05, R14 - MOVL R11, R15 - XORL R10, R15 - ANDL R9, R15 - XORL R11, R15 - ADDL R15, R14 + MOVL R11, BP + XORL R10, BP + ANDL R9, BP + XORL R11, BP + ADDL BP, R14 ADDL R12, R14 ADDL $0x5a827999, R14 ADDL R13, R14 ROLL $0x1e, R9 - // Round 1. - MOVL 4(CX), R12 + // Round 2. + MOVL 8(CX), R12 BSWAPL R12 - MOVL R12, 4(SP) + MOVL R12, 8(SP) MOVL R14, R13 ROLL $0x05, R13 - MOVL R10, R15 - XORL R9, R15 - ANDL R8, R15 - XORL R10, R15 - ADDL R15, R13 + MOVL R10, BP + XORL R9, BP + ANDL R15, BP + XORL R10, BP + ADDL BP, R13 ADDL R11, R13 ADDL $0x5a827999, R13 ADDL R12, R13 - ROLL $0x1e, R8 + ROLL $0x1e, R15 - // Round 2. - MOVL 8(CX), R11 + // Round 3. + MOVL 12(CX), R11 BSWAPL R11 - MOVL R11, 8(SP) + MOVL R11, 12(SP) MOVL R13, R12 ROLL $0x05, R12 - MOVL R9, R15 - XORL R8, R15 - ANDL R14, R15 - XORL R9, R15 - ADDL R15, R12 + MOVL R9, BP + XORL R15, BP + ANDL R14, BP + XORL R9, BP + ADDL BP, R12 ADDL R10, R12 ADDL $0x5a827999, R12 ADDL R11, R12 ROLL $0x1e, R14 - // Round 3. - MOVL 12(CX), R10 + // Round 4. + MOVL 16(CX), R10 BSWAPL R10 - MOVL R10, 12(SP) + MOVL R10, 16(SP) MOVL R12, R11 ROLL $0x05, R11 - MOVL R8, R15 - XORL R14, R15 - ANDL R13, R15 - XORL R8, R15 - ADDL R15, R11 + MOVL R15, BP + XORL R14, BP + ANDL R13, BP + XORL R15, BP + ADDL BP, R11 ADDL R9, R11 ADDL $0x5a827999, R11 ADDL R10, R11 ROLL $0x1e, R13 - // Round 4. - MOVL 16(CX), R9 + // Round 5. + MOVL 20(CX), R9 BSWAPL R9 - MOVL R9, 16(SP) + MOVL R9, 20(SP) MOVL R11, R10 ROLL $0x05, R10 - MOVL R14, R15 - XORL R13, R15 - ANDL R12, R15 - XORL R14, R15 + MOVL R14, BP + XORL R13, BP + ANDL R12, BP + XORL R14, BP + ADDL BP, R10 ADDL R15, R10 - ADDL R8, R10 ADDL $0x5a827999, R10 ADDL R9, R10 ROLL $0x1e, R12 - // Round 5. - MOVL 20(CX), R8 - BSWAPL R8 - MOVL R8, 20(SP) - MOVL R10, R9 - ROLL $0x05, R9 - MOVL R13, R15 - XORL R12, R15 - ANDL R11, R15 - XORL R13, R15 - ADDL R15, R9 - ADDL R14, R9 - ADDL $0x5a827999, R9 - ADDL R8, R9 - ROLL $0x1e, R11 - // Round 6. - MOVL 24(CX), R8 - BSWAPL R8 - MOVL R8, 24(SP) - MOVL R9, R14 - ROLL $0x05, R14 - MOVL R12, R15 - XORL R11, R15 - ANDL R10, R15 - XORL R12, R15 - ADDL R15, R14 - ADDL R13, R14 - ADDL $0x5a827999, R14 - ADDL R8, R14 - ROLL $0x1e, R10 - - // Round 7. - MOVL 28(CX), R8 - BSWAPL R8 - MOVL R8, 28(SP) - MOVL R14, R13 - ROLL $0x05, R13 - MOVL R11, R15 - XORL R10, R15 - ANDL R9, R15 - XORL R11, R15 - ADDL R15, R13 - ADDL R12, R13 - ADDL $0x5a827999, R13 - ADDL R8, R13 - ROLL $0x1e, R9 - - // Round 8. - MOVL 32(CX), R8 - BSWAPL R8 - MOVL R8, 32(SP) - MOVL R13, R12 - ROLL $0x05, R12 + MOVL 24(CX), R9 + BSWAPL R9 + MOVL R9, 24(SP) MOVL R10, R15 - XORL R9, R15 - ANDL R14, R15 - XORL R10, R15 - ADDL R15, R12 - ADDL R11, R12 - ADDL $0x5a827999, R12 - ADDL R8, R12 - ROLL $0x1e, R14 - - // Round 9. - MOVL 36(CX), R8 - BSWAPL R8 - MOVL R8, 36(SP) - MOVL R12, R11 - ROLL $0x05, R11 - MOVL R9, R15 - XORL R14, R15 - ANDL R13, R15 - XORL R9, R15 - ADDL R15, R11 - ADDL R10, R11 - ADDL $0x5a827999, R11 - ADDL R8, R11 - ROLL $0x1e, R13 - - // Round 10. - MOVL 40(CX), R8 - BSWAPL R8 - MOVL R8, 40(SP) - MOVL R11, R10 - ROLL $0x05, R10 - MOVL R14, R15 - XORL R13, R15 - ANDL R12, R15 - XORL R14, R15 - ADDL R15, R10 - ADDL R9, R10 - ADDL $0x5a827999, R10 - ADDL R8, R10 - ROLL $0x1e, R12 - - // Round 11. - MOVL 44(CX), R8 - BSWAPL R8 - MOVL R8, 44(SP) - MOVL R10, R9 - ROLL $0x05, R9 - MOVL R13, R15 - XORL R12, R15 - ANDL R11, R15 - XORL R13, R15 - ADDL R15, R9 - ADDL R14, R9 - ADDL $0x5a827999, R9 - ADDL R8, R9 + ROLL $0x05, R15 + MOVL R13, BP + XORL R12, BP + ANDL R11, BP + XORL R13, BP + ADDL BP, R15 + ADDL R14, R15 + ADDL $0x5a827999, R15 + ADDL R9, R15 ROLL $0x1e, R11 - // Round 12. - MOVL 48(CX), R8 - BSWAPL R8 - MOVL R8, 48(SP) - MOVL R9, R14 + // Round 7. + MOVL 28(CX), R9 + BSWAPL R9 + MOVL R9, 28(SP) + MOVL R15, R14 ROLL $0x05, R14 - MOVL R12, R15 - XORL R11, R15 - ANDL R10, R15 - XORL R12, R15 - ADDL R15, R14 + MOVL R12, BP + XORL R11, BP + ANDL R10, BP + XORL R12, BP + ADDL BP, R14 ADDL R13, R14 ADDL $0x5a827999, R14 - ADDL R8, R14 + ADDL R9, R14 ROLL $0x1e, R10 - // Round 13. - MOVL 52(CX), R8 - BSWAPL R8 - MOVL R8, 52(SP) + // Round 8. + MOVL 32(CX), R9 + BSWAPL R9 + MOVL R9, 32(SP) MOVL R14, R13 ROLL $0x05, R13 - MOVL R11, R15 - XORL R10, R15 - ANDL R9, R15 - XORL R11, R15 - ADDL R15, R13 + MOVL R11, BP + XORL R10, BP + ANDL R15, BP + XORL R11, BP + ADDL BP, R13 ADDL R12, R13 ADDL $0x5a827999, R13 - ADDL R8, R13 - ROLL $0x1e, R9 + ADDL R9, R13 + ROLL $0x1e, R15 - // Round 14. - MOVL 56(CX), R8 - BSWAPL R8 - MOVL R8, 56(SP) + // Round 9. + MOVL 36(CX), R9 + BSWAPL R9 + MOVL R9, 36(SP) MOVL R13, R12 ROLL $0x05, R12 - MOVL R10, R15 - XORL R9, R15 - ANDL R14, R15 - XORL R10, R15 - ADDL R15, R12 + MOVL R10, BP + XORL R15, BP + ANDL R14, BP + XORL R10, BP + ADDL BP, R12 ADDL R11, R12 ADDL $0x5a827999, R12 - ADDL R8, R12 + ADDL R9, R12 ROLL $0x1e, R14 + // Round 10. + MOVL 40(CX), R9 + BSWAPL R9 + MOVL R9, 40(SP) + MOVL R12, R11 + ROLL $0x05, R11 + MOVL R15, BP + XORL R14, BP + ANDL R13, BP + XORL R15, BP + ADDL BP, R11 + ADDL R10, R11 + ADDL $0x5a827999, R11 + ADDL R9, R11 + ROLL $0x1e, R13 + + // Round 11. + MOVL 44(CX), R9 + BSWAPL R9 + MOVL R9, 44(SP) + MOVL R11, R10 + ROLL $0x05, R10 + MOVL R14, BP + XORL R13, BP + ANDL R12, BP + XORL R14, BP + ADDL BP, R10 + ADDL R15, R10 + ADDL $0x5a827999, R10 + ADDL R9, R10 + ROLL $0x1e, R12 + + // Round 12. + MOVL 48(CX), R9 + BSWAPL R9 + MOVL R9, 48(SP) + MOVL R10, R15 + ROLL $0x05, R15 + MOVL R13, BP + XORL R12, BP + ANDL R11, BP + XORL R13, BP + ADDL BP, R15 + ADDL R14, R15 + ADDL $0x5a827999, R15 + ADDL R9, R15 + ROLL $0x1e, R11 + + // Round 13. + MOVL 52(CX), R9 + BSWAPL R9 + MOVL R9, 52(SP) + MOVL R15, R14 + ROLL $0x05, R14 + MOVL R12, BP + XORL R11, BP + ANDL R10, BP + XORL R12, BP + ADDL BP, R14 + ADDL R13, R14 + ADDL $0x5a827999, R14 + ADDL R9, R14 + ROLL $0x1e, R10 + + // Round 14. + MOVL 56(CX), R9 + BSWAPL R9 + MOVL R9, 56(SP) + MOVL R14, R13 + ROLL $0x05, R13 + MOVL R11, BP + XORL R10, BP + ANDL R15, BP + XORL R11, BP + ADDL BP, R13 + ADDL R12, R13 + ADDL $0x5a827999, R13 + ADDL R9, R13 + ROLL $0x1e, R15 + // Round 15. MOVL 60(CX), CX BSWAPL CX MOVL CX, 60(SP) - MOVL R12, R8 - ROLL $0x05, R8 - MOVL R9, R11 - XORL R14, R11 - ANDL R13, R11 - XORL R9, R11 - ADDL R11, R8 - ADDL R10, R8 - ADDL $0x5a827999, R8 - ADDL CX, R8 - ROLL $0x1e, R13 + MOVL R13, R9 + ROLL $0x05, R9 + MOVL R10, R12 + XORL R15, R12 + ANDL R14, R12 + XORL R10, R12 + ADDL R12, R9 + ADDL R11, R9 + ADDL $0x5a827999, R9 + ADDL CX, R9 + ROLL $0x1e, R14 // Round 16. MOVL 52(SP), CX @@ -282,17 +282,17 @@ TEXT ·block(SB), $64-32 XORL (SP), CX ROLL $0x01, CX MOVL CX, (SP) - MOVL R8, R10 - ROLL $0x05, R10 - MOVL R14, R11 - XORL R13, R11 - ANDL R12, R11 - XORL R14, R11 - ADDL R11, R10 - ADDL R9, R10 - ADDL $0x5a827999, R10 - ADDL CX, R10 - ROLL $0x1e, R12 + MOVL R9, R11 + ROLL $0x05, R11 + MOVL R15, R12 + XORL R14, R12 + ANDL R13, R12 + XORL R15, R12 + ADDL R12, R11 + ADDL R10, R11 + ADDL $0x5a827999, R11 + ADDL CX, R11 + ROLL $0x1e, R13 // Round 17. MOVL 56(SP), CX @@ -301,17 +301,17 @@ TEXT ·block(SB), $64-32 XORL 4(SP), CX ROLL $0x01, CX MOVL CX, 4(SP) - MOVL R10, R9 - ROLL $0x05, R9 - MOVL R13, R11 - XORL R12, R11 - ANDL R8, R11 - XORL R13, R11 - ADDL R11, R9 - ADDL R14, R9 - ADDL $0x5a827999, R9 - ADDL CX, R9 - ROLL $0x1e, R8 + MOVL R11, R10 + ROLL $0x05, R10 + MOVL R14, R12 + XORL R13, R12 + ANDL R9, R12 + XORL R14, R12 + ADDL R12, R10 + ADDL R15, R10 + ADDL $0x5a827999, R10 + ADDL CX, R10 + ROLL $0x1e, R9 // Round 18. MOVL 60(SP), CX @@ -320,17 +320,17 @@ TEXT ·block(SB), $64-32 XORL 8(SP), CX ROLL $0x01, CX MOVL CX, 8(SP) - MOVL R9, R11 - ROLL $0x05, R11 - MOVL R12, R14 - XORL R8, R14 - ANDL R10, R14 - XORL R12, R14 - ADDL R14, R11 - ADDL R13, R11 - ADDL $0x5a827999, R11 - ADDL CX, R11 - ROLL $0x1e, R10 + MOVL R10, R12 + ROLL $0x05, R12 + MOVL R13, R15 + XORL R9, R15 + ANDL R11, R15 + XORL R13, R15 + ADDL R15, R12 + ADDL R14, R12 + ADDL $0x5a827999, R12 + ADDL CX, R12 + ROLL $0x1e, R11 // Round 19. MOVL (SP), CX @@ -339,17 +339,17 @@ TEXT ·block(SB), $64-32 XORL 12(SP), CX ROLL $0x01, CX MOVL CX, 12(SP) - MOVL R11, R13 - ROLL $0x05, R13 - MOVL R8, R14 - XORL R10, R14 - ANDL R9, R14 - XORL R8, R14 - ADDL R14, R13 - ADDL R12, R13 - ADDL $0x5a827999, R13 - ADDL CX, R13 - ROLL $0x1e, R9 + MOVL R12, R14 + ROLL $0x05, R14 + MOVL R9, R15 + XORL R11, R15 + ANDL R10, R15 + XORL R9, R15 + ADDL R15, R14 + ADDL R13, R14 + ADDL $0x5a827999, R14 + ADDL CX, R14 + ROLL $0x1e, R10 // Round 20. MOVL 4(SP), CX @@ -358,16 +358,16 @@ TEXT ·block(SB), $64-32 XORL 16(SP), CX ROLL $0x01, CX MOVL CX, 16(SP) - MOVL R13, R12 - ROLL $0x05, R12 - MOVL R11, R14 - XORL R9, R14 - XORL R10, R14 - ADDL R14, R12 - ADDL R8, R12 - ADDL $0x6ed9eba1, R12 - ADDL CX, R12 - ROLL $0x1e, R11 + MOVL R14, R13 + ROLL $0x05, R13 + MOVL R12, R15 + XORL R10, R15 + XORL R11, R15 + ADDL R15, R13 + ADDL R9, R13 + ADDL $0x6ed9eba1, R13 + ADDL CX, R13 + ROLL $0x1e, R12 // Round 21. MOVL 8(SP), CX @@ -376,16 +376,16 @@ TEXT ·block(SB), $64-32 XORL 20(SP), CX ROLL $0x01, CX MOVL CX, 20(SP) - MOVL R12, R8 - ROLL $0x05, R8 - MOVL R13, R14 - XORL R11, R14 - XORL R9, R14 - ADDL R14, R8 - ADDL R10, R8 - ADDL $0x6ed9eba1, R8 - ADDL CX, R8 - ROLL $0x1e, R13 + MOVL R13, R9 + ROLL $0x05, R9 + MOVL R14, R15 + XORL R12, R15 + XORL R10, R15 + ADDL R15, R9 + ADDL R11, R9 + ADDL $0x6ed9eba1, R9 + ADDL CX, R9 + ROLL $0x1e, R14 // Round 22. MOVL 12(SP), CX @@ -394,16 +394,16 @@ TEXT ·block(SB), $64-32 XORL 24(SP), CX ROLL $0x01, CX MOVL CX, 24(SP) - MOVL R8, R10 - ROLL $0x05, R10 - MOVL R12, R14 - XORL R13, R14 - XORL R11, R14 - ADDL R14, R10 - ADDL R9, R10 - ADDL $0x6ed9eba1, R10 - ADDL CX, R10 - ROLL $0x1e, R12 + MOVL R9, R11 + ROLL $0x05, R11 + MOVL R13, R15 + XORL R14, R15 + XORL R12, R15 + ADDL R15, R11 + ADDL R10, R11 + ADDL $0x6ed9eba1, R11 + ADDL CX, R11 + ROLL $0x1e, R13 // Round 23. MOVL 16(SP), CX @@ -412,16 +412,16 @@ TEXT ·block(SB), $64-32 XORL 28(SP), CX ROLL $0x01, CX MOVL CX, 28(SP) - MOVL R10, R9 - ROLL $0x05, R9 - MOVL R8, R14 - XORL R12, R14 - XORL R13, R14 - ADDL R14, R9 - ADDL R11, R9 - ADDL $0x6ed9eba1, R9 - ADDL CX, R9 - ROLL $0x1e, R8 + MOVL R11, R10 + ROLL $0x05, R10 + MOVL R9, R15 + XORL R13, R15 + XORL R14, R15 + ADDL R15, R10 + ADDL R12, R10 + ADDL $0x6ed9eba1, R10 + ADDL CX, R10 + ROLL $0x1e, R9 // Round 24. MOVL 20(SP), CX @@ -430,16 +430,16 @@ TEXT ·block(SB), $64-32 XORL 32(SP), CX ROLL $0x01, CX MOVL CX, 32(SP) - MOVL R9, R11 - ROLL $0x05, R11 - MOVL R10, R14 - XORL R8, R14 - XORL R12, R14 - ADDL R14, R11 - ADDL R13, R11 - ADDL $0x6ed9eba1, R11 - ADDL CX, R11 - ROLL $0x1e, R10 + MOVL R10, R12 + ROLL $0x05, R12 + MOVL R11, R15 + XORL R9, R15 + XORL R13, R15 + ADDL R15, R12 + ADDL R14, R12 + ADDL $0x6ed9eba1, R12 + ADDL CX, R12 + ROLL $0x1e, R11 // Round 25. MOVL 24(SP), CX @@ -448,16 +448,16 @@ TEXT ·block(SB), $64-32 XORL 36(SP), CX ROLL $0x01, CX MOVL CX, 36(SP) - MOVL R11, R13 - ROLL $0x05, R13 - MOVL R9, R14 - XORL R10, R14 - XORL R8, R14 - ADDL R14, R13 - ADDL R12, R13 - ADDL $0x6ed9eba1, R13 - ADDL CX, R13 - ROLL $0x1e, R9 + MOVL R12, R14 + ROLL $0x05, R14 + MOVL R10, R15 + XORL R11, R15 + XORL R9, R15 + ADDL R15, R14 + ADDL R13, R14 + ADDL $0x6ed9eba1, R14 + ADDL CX, R14 + ROLL $0x1e, R10 // Round 26. MOVL 28(SP), CX @@ -466,16 +466,16 @@ TEXT ·block(SB), $64-32 XORL 40(SP), CX ROLL $0x01, CX MOVL CX, 40(SP) - MOVL R13, R12 - ROLL $0x05, R12 - MOVL R11, R14 - XORL R9, R14 - XORL R10, R14 - ADDL R14, R12 - ADDL R8, R12 - ADDL $0x6ed9eba1, R12 - ADDL CX, R12 - ROLL $0x1e, R11 + MOVL R14, R13 + ROLL $0x05, R13 + MOVL R12, R15 + XORL R10, R15 + XORL R11, R15 + ADDL R15, R13 + ADDL R9, R13 + ADDL $0x6ed9eba1, R13 + ADDL CX, R13 + ROLL $0x1e, R12 // Round 27. MOVL 32(SP), CX @@ -484,16 +484,16 @@ TEXT ·block(SB), $64-32 XORL 44(SP), CX ROLL $0x01, CX MOVL CX, 44(SP) - MOVL R12, R8 - ROLL $0x05, R8 - MOVL R13, R14 - XORL R11, R14 - XORL R9, R14 - ADDL R14, R8 - ADDL R10, R8 - ADDL $0x6ed9eba1, R8 - ADDL CX, R8 - ROLL $0x1e, R13 + MOVL R13, R9 + ROLL $0x05, R9 + MOVL R14, R15 + XORL R12, R15 + XORL R10, R15 + ADDL R15, R9 + ADDL R11, R9 + ADDL $0x6ed9eba1, R9 + ADDL CX, R9 + ROLL $0x1e, R14 // Round 28. MOVL 36(SP), CX @@ -502,16 +502,16 @@ TEXT ·block(SB), $64-32 XORL 48(SP), CX ROLL $0x01, CX MOVL CX, 48(SP) - MOVL R8, R10 - ROLL $0x05, R10 - MOVL R12, R14 - XORL R13, R14 - XORL R11, R14 - ADDL R14, R10 - ADDL R9, R10 - ADDL $0x6ed9eba1, R10 - ADDL CX, R10 - ROLL $0x1e, R12 + MOVL R9, R11 + ROLL $0x05, R11 + MOVL R13, R15 + XORL R14, R15 + XORL R12, R15 + ADDL R15, R11 + ADDL R10, R11 + ADDL $0x6ed9eba1, R11 + ADDL CX, R11 + ROLL $0x1e, R13 // Round 29. MOVL 40(SP), CX @@ -520,16 +520,16 @@ TEXT ·block(SB), $64-32 XORL 52(SP), CX ROLL $0x01, CX MOVL CX, 52(SP) - MOVL R10, R9 - ROLL $0x05, R9 - MOVL R8, R14 - XORL R12, R14 - XORL R13, R14 - ADDL R14, R9 - ADDL R11, R9 - ADDL $0x6ed9eba1, R9 - ADDL CX, R9 - ROLL $0x1e, R8 + MOVL R11, R10 + ROLL $0x05, R10 + MOVL R9, R15 + XORL R13, R15 + XORL R14, R15 + ADDL R15, R10 + ADDL R12, R10 + ADDL $0x6ed9eba1, R10 + ADDL CX, R10 + ROLL $0x1e, R9 // Round 30. MOVL 44(SP), CX @@ -538,16 +538,16 @@ TEXT ·block(SB), $64-32 XORL 56(SP), CX ROLL $0x01, CX MOVL CX, 56(SP) - MOVL R9, R11 - ROLL $0x05, R11 - MOVL R10, R14 - XORL R8, R14 - XORL R12, R14 - ADDL R14, R11 - ADDL R13, R11 - ADDL $0x6ed9eba1, R11 - ADDL CX, R11 - ROLL $0x1e, R10 + MOVL R10, R12 + ROLL $0x05, R12 + MOVL R11, R15 + XORL R9, R15 + XORL R13, R15 + ADDL R15, R12 + ADDL R14, R12 + ADDL $0x6ed9eba1, R12 + ADDL CX, R12 + ROLL $0x1e, R11 // Round 31. MOVL 48(SP), CX @@ -556,16 +556,16 @@ TEXT ·block(SB), $64-32 XORL 60(SP), CX ROLL $0x01, CX MOVL CX, 60(SP) - MOVL R11, R13 - ROLL $0x05, R13 - MOVL R9, R14 - XORL R10, R14 - XORL R8, R14 - ADDL R14, R13 - ADDL R12, R13 - ADDL $0x6ed9eba1, R13 - ADDL CX, R13 - ROLL $0x1e, R9 + MOVL R12, R14 + ROLL $0x05, R14 + MOVL R10, R15 + XORL R11, R15 + XORL R9, R15 + ADDL R15, R14 + ADDL R13, R14 + ADDL $0x6ed9eba1, R14 + ADDL CX, R14 + ROLL $0x1e, R10 // Round 32. MOVL 52(SP), CX @@ -574,16 +574,16 @@ TEXT ·block(SB), $64-32 XORL (SP), CX ROLL $0x01, CX MOVL CX, (SP) - MOVL R13, R12 - ROLL $0x05, R12 - MOVL R11, R14 - XORL R9, R14 - XORL R10, R14 - ADDL R14, R12 - ADDL R8, R12 - ADDL $0x6ed9eba1, R12 - ADDL CX, R12 - ROLL $0x1e, R11 + MOVL R14, R13 + ROLL $0x05, R13 + MOVL R12, R15 + XORL R10, R15 + XORL R11, R15 + ADDL R15, R13 + ADDL R9, R13 + ADDL $0x6ed9eba1, R13 + ADDL CX, R13 + ROLL $0x1e, R12 // Round 33. MOVL 56(SP), CX @@ -592,16 +592,16 @@ TEXT ·block(SB), $64-32 XORL 4(SP), CX ROLL $0x01, CX MOVL CX, 4(SP) - MOVL R12, R8 - ROLL $0x05, R8 - MOVL R13, R14 - XORL R11, R14 - XORL R9, R14 - ADDL R14, R8 - ADDL R10, R8 - ADDL $0x6ed9eba1, R8 - ADDL CX, R8 - ROLL $0x1e, R13 + MOVL R13, R9 + ROLL $0x05, R9 + MOVL R14, R15 + XORL R12, R15 + XORL R10, R15 + ADDL R15, R9 + ADDL R11, R9 + ADDL $0x6ed9eba1, R9 + ADDL CX, R9 + ROLL $0x1e, R14 // Round 34. MOVL 60(SP), CX @@ -610,16 +610,16 @@ TEXT ·block(SB), $64-32 XORL 8(SP), CX ROLL $0x01, CX MOVL CX, 8(SP) - MOVL R8, R10 - ROLL $0x05, R10 - MOVL R12, R14 - XORL R13, R14 - XORL R11, R14 - ADDL R14, R10 - ADDL R9, R10 - ADDL $0x6ed9eba1, R10 - ADDL CX, R10 - ROLL $0x1e, R12 + MOVL R9, R11 + ROLL $0x05, R11 + MOVL R13, R15 + XORL R14, R15 + XORL R12, R15 + ADDL R15, R11 + ADDL R10, R11 + ADDL $0x6ed9eba1, R11 + ADDL CX, R11 + ROLL $0x1e, R13 // Round 35. MOVL (SP), CX @@ -628,16 +628,16 @@ TEXT ·block(SB), $64-32 XORL 12(SP), CX ROLL $0x01, CX MOVL CX, 12(SP) - MOVL R10, R9 - ROLL $0x05, R9 - MOVL R8, R14 - XORL R12, R14 - XORL R13, R14 - ADDL R14, R9 - ADDL R11, R9 - ADDL $0x6ed9eba1, R9 - ADDL CX, R9 - ROLL $0x1e, R8 + MOVL R11, R10 + ROLL $0x05, R10 + MOVL R9, R15 + XORL R13, R15 + XORL R14, R15 + ADDL R15, R10 + ADDL R12, R10 + ADDL $0x6ed9eba1, R10 + ADDL CX, R10 + ROLL $0x1e, R9 // Round 36. MOVL 4(SP), CX @@ -646,16 +646,16 @@ TEXT ·block(SB), $64-32 XORL 16(SP), CX ROLL $0x01, CX MOVL CX, 16(SP) - MOVL R9, R11 - ROLL $0x05, R11 - MOVL R10, R14 - XORL R8, R14 - XORL R12, R14 - ADDL R14, R11 - ADDL R13, R11 - ADDL $0x6ed9eba1, R11 - ADDL CX, R11 - ROLL $0x1e, R10 + MOVL R10, R12 + ROLL $0x05, R12 + MOVL R11, R15 + XORL R9, R15 + XORL R13, R15 + ADDL R15, R12 + ADDL R14, R12 + ADDL $0x6ed9eba1, R12 + ADDL CX, R12 + ROLL $0x1e, R11 // Round 37. MOVL 8(SP), CX @@ -664,16 +664,16 @@ TEXT ·block(SB), $64-32 XORL 20(SP), CX ROLL $0x01, CX MOVL CX, 20(SP) - MOVL R11, R13 - ROLL $0x05, R13 - MOVL R9, R14 - XORL R10, R14 - XORL R8, R14 - ADDL R14, R13 - ADDL R12, R13 - ADDL $0x6ed9eba1, R13 - ADDL CX, R13 - ROLL $0x1e, R9 + MOVL R12, R14 + ROLL $0x05, R14 + MOVL R10, R15 + XORL R11, R15 + XORL R9, R15 + ADDL R15, R14 + ADDL R13, R14 + ADDL $0x6ed9eba1, R14 + ADDL CX, R14 + ROLL $0x1e, R10 // Round 38. MOVL 12(SP), CX @@ -682,16 +682,16 @@ TEXT ·block(SB), $64-32 XORL 24(SP), CX ROLL $0x01, CX MOVL CX, 24(SP) - MOVL R13, R12 - ROLL $0x05, R12 - MOVL R11, R14 - XORL R9, R14 - XORL R10, R14 - ADDL R14, R12 - ADDL R8, R12 - ADDL $0x6ed9eba1, R12 - ADDL CX, R12 - ROLL $0x1e, R11 + MOVL R14, R13 + ROLL $0x05, R13 + MOVL R12, R15 + XORL R10, R15 + XORL R11, R15 + ADDL R15, R13 + ADDL R9, R13 + ADDL $0x6ed9eba1, R13 + ADDL CX, R13 + ROLL $0x1e, R12 // Round 39. MOVL 16(SP), CX @@ -700,16 +700,16 @@ TEXT ·block(SB), $64-32 XORL 28(SP), CX ROLL $0x01, CX MOVL CX, 28(SP) - MOVL R12, R8 - ROLL $0x05, R8 - MOVL R13, R14 - XORL R11, R14 - XORL R9, R14 - ADDL R14, R8 - ADDL R10, R8 - ADDL $0x6ed9eba1, R8 - ADDL CX, R8 - ROLL $0x1e, R13 + MOVL R13, R9 + ROLL $0x05, R9 + MOVL R14, R15 + XORL R12, R15 + XORL R10, R15 + ADDL R15, R9 + ADDL R11, R9 + ADDL $0x6ed9eba1, R9 + ADDL CX, R9 + ROLL $0x1e, R14 // Round 40. MOVL 20(SP), CX @@ -718,19 +718,19 @@ TEXT ·block(SB), $64-32 XORL 32(SP), CX ROLL $0x01, CX MOVL CX, 32(SP) - MOVL R8, R10 - ROLL $0x05, R10 - MOVL R12, R14 - ORL R13, R14 - ANDL R11, R14 - MOVL R12, R15 - ANDL R13, R15 + MOVL R9, R11 + ROLL $0x05, R11 + MOVL R13, R15 ORL R14, R15 - ADDL R15, R10 - ADDL R9, R10 - ADDL $0x8f1bbcdc, R10 - ADDL CX, R10 - ROLL $0x1e, R12 + ANDL R12, R15 + MOVL R13, BP + ANDL R14, BP + ORL R15, BP + ADDL BP, R11 + ADDL R10, R11 + ADDL $0x8f1bbcdc, R11 + ADDL CX, R11 + ROLL $0x1e, R13 // Round 41. MOVL 24(SP), CX @@ -739,19 +739,19 @@ TEXT ·block(SB), $64-32 XORL 36(SP), CX ROLL $0x01, CX MOVL CX, 36(SP) - MOVL R10, R9 - ROLL $0x05, R9 - MOVL R8, R14 - ORL R12, R14 - ANDL R13, R14 - MOVL R8, R15 - ANDL R12, R15 - ORL R14, R15 - ADDL R15, R9 - ADDL R11, R9 - ADDL $0x8f1bbcdc, R9 - ADDL CX, R9 - ROLL $0x1e, R8 + MOVL R11, R10 + ROLL $0x05, R10 + MOVL R9, R15 + ORL R13, R15 + ANDL R14, R15 + MOVL R9, BP + ANDL R13, BP + ORL R15, BP + ADDL BP, R10 + ADDL R12, R10 + ADDL $0x8f1bbcdc, R10 + ADDL CX, R10 + ROLL $0x1e, R9 // Round 42. MOVL 28(SP), CX @@ -760,19 +760,19 @@ TEXT ·block(SB), $64-32 XORL 40(SP), CX ROLL $0x01, CX MOVL CX, 40(SP) - MOVL R9, R11 - ROLL $0x05, R11 - MOVL R10, R14 - ORL R8, R14 - ANDL R12, R14 - MOVL R10, R15 - ANDL R8, R15 - ORL R14, R15 - ADDL R15, R11 - ADDL R13, R11 - ADDL $0x8f1bbcdc, R11 - ADDL CX, R11 - ROLL $0x1e, R10 + MOVL R10, R12 + ROLL $0x05, R12 + MOVL R11, R15 + ORL R9, R15 + ANDL R13, R15 + MOVL R11, BP + ANDL R9, BP + ORL R15, BP + ADDL BP, R12 + ADDL R14, R12 + ADDL $0x8f1bbcdc, R12 + ADDL CX, R12 + ROLL $0x1e, R11 // Round 43. MOVL 32(SP), CX @@ -781,19 +781,19 @@ TEXT ·block(SB), $64-32 XORL 44(SP), CX ROLL $0x01, CX MOVL CX, 44(SP) - MOVL R11, R13 - ROLL $0x05, R13 - MOVL R9, R14 - ORL R10, R14 - ANDL R8, R14 - MOVL R9, R15 - ANDL R10, R15 - ORL R14, R15 - ADDL R15, R13 - ADDL R12, R13 - ADDL $0x8f1bbcdc, R13 - ADDL CX, R13 - ROLL $0x1e, R9 + MOVL R12, R14 + ROLL $0x05, R14 + MOVL R10, R15 + ORL R11, R15 + ANDL R9, R15 + MOVL R10, BP + ANDL R11, BP + ORL R15, BP + ADDL BP, R14 + ADDL R13, R14 + ADDL $0x8f1bbcdc, R14 + ADDL CX, R14 + ROLL $0x1e, R10 // Round 44. MOVL 36(SP), CX @@ -802,19 +802,19 @@ TEXT ·block(SB), $64-32 XORL 48(SP), CX ROLL $0x01, CX MOVL CX, 48(SP) - MOVL R13, R12 - ROLL $0x05, R12 - MOVL R11, R14 - ORL R9, R14 - ANDL R10, R14 - MOVL R11, R15 - ANDL R9, R15 - ORL R14, R15 - ADDL R15, R12 - ADDL R8, R12 - ADDL $0x8f1bbcdc, R12 - ADDL CX, R12 - ROLL $0x1e, R11 + MOVL R14, R13 + ROLL $0x05, R13 + MOVL R12, R15 + ORL R10, R15 + ANDL R11, R15 + MOVL R12, BP + ANDL R10, BP + ORL R15, BP + ADDL BP, R13 + ADDL R9, R13 + ADDL $0x8f1bbcdc, R13 + ADDL CX, R13 + ROLL $0x1e, R12 // Round 45. MOVL 40(SP), CX @@ -823,19 +823,19 @@ TEXT ·block(SB), $64-32 XORL 52(SP), CX ROLL $0x01, CX MOVL CX, 52(SP) - MOVL R12, R8 - ROLL $0x05, R8 - MOVL R13, R14 - ORL R11, R14 - ANDL R9, R14 - MOVL R13, R15 - ANDL R11, R15 - ORL R14, R15 - ADDL R15, R8 - ADDL R10, R8 - ADDL $0x8f1bbcdc, R8 - ADDL CX, R8 - ROLL $0x1e, R13 + MOVL R13, R9 + ROLL $0x05, R9 + MOVL R14, R15 + ORL R12, R15 + ANDL R10, R15 + MOVL R14, BP + ANDL R12, BP + ORL R15, BP + ADDL BP, R9 + ADDL R11, R9 + ADDL $0x8f1bbcdc, R9 + ADDL CX, R9 + ROLL $0x1e, R14 // Round 46. MOVL 44(SP), CX @@ -844,19 +844,19 @@ TEXT ·block(SB), $64-32 XORL 56(SP), CX ROLL $0x01, CX MOVL CX, 56(SP) - MOVL R8, R10 - ROLL $0x05, R10 - MOVL R12, R14 - ORL R13, R14 - ANDL R11, R14 - MOVL R12, R15 - ANDL R13, R15 + MOVL R9, R11 + ROLL $0x05, R11 + MOVL R13, R15 ORL R14, R15 - ADDL R15, R10 - ADDL R9, R10 - ADDL $0x8f1bbcdc, R10 - ADDL CX, R10 - ROLL $0x1e, R12 + ANDL R12, R15 + MOVL R13, BP + ANDL R14, BP + ORL R15, BP + ADDL BP, R11 + ADDL R10, R11 + ADDL $0x8f1bbcdc, R11 + ADDL CX, R11 + ROLL $0x1e, R13 // Round 47. MOVL 48(SP), CX @@ -865,19 +865,19 @@ TEXT ·block(SB), $64-32 XORL 60(SP), CX ROLL $0x01, CX MOVL CX, 60(SP) - MOVL R10, R9 - ROLL $0x05, R9 - MOVL R8, R14 - ORL R12, R14 - ANDL R13, R14 - MOVL R8, R15 - ANDL R12, R15 - ORL R14, R15 - ADDL R15, R9 - ADDL R11, R9 - ADDL $0x8f1bbcdc, R9 - ADDL CX, R9 - ROLL $0x1e, R8 + MOVL R11, R10 + ROLL $0x05, R10 + MOVL R9, R15 + ORL R13, R15 + ANDL R14, R15 + MOVL R9, BP + ANDL R13, BP + ORL R15, BP + ADDL BP, R10 + ADDL R12, R10 + ADDL $0x8f1bbcdc, R10 + ADDL CX, R10 + ROLL $0x1e, R9 // Round 48. MOVL 52(SP), CX @@ -886,19 +886,19 @@ TEXT ·block(SB), $64-32 XORL (SP), CX ROLL $0x01, CX MOVL CX, (SP) - MOVL R9, R11 - ROLL $0x05, R11 - MOVL R10, R14 - ORL R8, R14 - ANDL R12, R14 - MOVL R10, R15 - ANDL R8, R15 - ORL R14, R15 - ADDL R15, R11 - ADDL R13, R11 - ADDL $0x8f1bbcdc, R11 - ADDL CX, R11 - ROLL $0x1e, R10 + MOVL R10, R12 + ROLL $0x05, R12 + MOVL R11, R15 + ORL R9, R15 + ANDL R13, R15 + MOVL R11, BP + ANDL R9, BP + ORL R15, BP + ADDL BP, R12 + ADDL R14, R12 + ADDL $0x8f1bbcdc, R12 + ADDL CX, R12 + ROLL $0x1e, R11 // Round 49. MOVL 56(SP), CX @@ -907,19 +907,19 @@ TEXT ·block(SB), $64-32 XORL 4(SP), CX ROLL $0x01, CX MOVL CX, 4(SP) - MOVL R11, R13 - ROLL $0x05, R13 - MOVL R9, R14 - ORL R10, R14 - ANDL R8, R14 - MOVL R9, R15 - ANDL R10, R15 - ORL R14, R15 - ADDL R15, R13 - ADDL R12, R13 - ADDL $0x8f1bbcdc, R13 - ADDL CX, R13 - ROLL $0x1e, R9 + MOVL R12, R14 + ROLL $0x05, R14 + MOVL R10, R15 + ORL R11, R15 + ANDL R9, R15 + MOVL R10, BP + ANDL R11, BP + ORL R15, BP + ADDL BP, R14 + ADDL R13, R14 + ADDL $0x8f1bbcdc, R14 + ADDL CX, R14 + ROLL $0x1e, R10 // Round 50. MOVL 60(SP), CX @@ -928,19 +928,19 @@ TEXT ·block(SB), $64-32 XORL 8(SP), CX ROLL $0x01, CX MOVL CX, 8(SP) - MOVL R13, R12 - ROLL $0x05, R12 - MOVL R11, R14 - ORL R9, R14 - ANDL R10, R14 - MOVL R11, R15 - ANDL R9, R15 - ORL R14, R15 - ADDL R15, R12 - ADDL R8, R12 - ADDL $0x8f1bbcdc, R12 - ADDL CX, R12 - ROLL $0x1e, R11 + MOVL R14, R13 + ROLL $0x05, R13 + MOVL R12, R15 + ORL R10, R15 + ANDL R11, R15 + MOVL R12, BP + ANDL R10, BP + ORL R15, BP + ADDL BP, R13 + ADDL R9, R13 + ADDL $0x8f1bbcdc, R13 + ADDL CX, R13 + ROLL $0x1e, R12 // Round 51. MOVL (SP), CX @@ -949,19 +949,19 @@ TEXT ·block(SB), $64-32 XORL 12(SP), CX ROLL $0x01, CX MOVL CX, 12(SP) - MOVL R12, R8 - ROLL $0x05, R8 - MOVL R13, R14 - ORL R11, R14 - ANDL R9, R14 - MOVL R13, R15 - ANDL R11, R15 - ORL R14, R15 - ADDL R15, R8 - ADDL R10, R8 - ADDL $0x8f1bbcdc, R8 - ADDL CX, R8 - ROLL $0x1e, R13 + MOVL R13, R9 + ROLL $0x05, R9 + MOVL R14, R15 + ORL R12, R15 + ANDL R10, R15 + MOVL R14, BP + ANDL R12, BP + ORL R15, BP + ADDL BP, R9 + ADDL R11, R9 + ADDL $0x8f1bbcdc, R9 + ADDL CX, R9 + ROLL $0x1e, R14 // Round 52. MOVL 4(SP), CX @@ -970,19 +970,19 @@ TEXT ·block(SB), $64-32 XORL 16(SP), CX ROLL $0x01, CX MOVL CX, 16(SP) - MOVL R8, R10 - ROLL $0x05, R10 - MOVL R12, R14 - ORL R13, R14 - ANDL R11, R14 - MOVL R12, R15 - ANDL R13, R15 + MOVL R9, R11 + ROLL $0x05, R11 + MOVL R13, R15 ORL R14, R15 - ADDL R15, R10 - ADDL R9, R10 - ADDL $0x8f1bbcdc, R10 - ADDL CX, R10 - ROLL $0x1e, R12 + ANDL R12, R15 + MOVL R13, BP + ANDL R14, BP + ORL R15, BP + ADDL BP, R11 + ADDL R10, R11 + ADDL $0x8f1bbcdc, R11 + ADDL CX, R11 + ROLL $0x1e, R13 // Round 53. MOVL 8(SP), CX @@ -991,19 +991,19 @@ TEXT ·block(SB), $64-32 XORL 20(SP), CX ROLL $0x01, CX MOVL CX, 20(SP) - MOVL R10, R9 - ROLL $0x05, R9 - MOVL R8, R14 - ORL R12, R14 - ANDL R13, R14 - MOVL R8, R15 - ANDL R12, R15 - ORL R14, R15 - ADDL R15, R9 - ADDL R11, R9 - ADDL $0x8f1bbcdc, R9 - ADDL CX, R9 - ROLL $0x1e, R8 + MOVL R11, R10 + ROLL $0x05, R10 + MOVL R9, R15 + ORL R13, R15 + ANDL R14, R15 + MOVL R9, BP + ANDL R13, BP + ORL R15, BP + ADDL BP, R10 + ADDL R12, R10 + ADDL $0x8f1bbcdc, R10 + ADDL CX, R10 + ROLL $0x1e, R9 // Round 54. MOVL 12(SP), CX @@ -1012,19 +1012,19 @@ TEXT ·block(SB), $64-32 XORL 24(SP), CX ROLL $0x01, CX MOVL CX, 24(SP) - MOVL R9, R11 - ROLL $0x05, R11 - MOVL R10, R14 - ORL R8, R14 - ANDL R12, R14 - MOVL R10, R15 - ANDL R8, R15 - ORL R14, R15 - ADDL R15, R11 - ADDL R13, R11 - ADDL $0x8f1bbcdc, R11 - ADDL CX, R11 - ROLL $0x1e, R10 + MOVL R10, R12 + ROLL $0x05, R12 + MOVL R11, R15 + ORL R9, R15 + ANDL R13, R15 + MOVL R11, BP + ANDL R9, BP + ORL R15, BP + ADDL BP, R12 + ADDL R14, R12 + ADDL $0x8f1bbcdc, R12 + ADDL CX, R12 + ROLL $0x1e, R11 // Round 55. MOVL 16(SP), CX @@ -1033,19 +1033,19 @@ TEXT ·block(SB), $64-32 XORL 28(SP), CX ROLL $0x01, CX MOVL CX, 28(SP) - MOVL R11, R13 - ROLL $0x05, R13 - MOVL R9, R14 - ORL R10, R14 - ANDL R8, R14 - MOVL R9, R15 - ANDL R10, R15 - ORL R14, R15 - ADDL R15, R13 - ADDL R12, R13 - ADDL $0x8f1bbcdc, R13 - ADDL CX, R13 - ROLL $0x1e, R9 + MOVL R12, R14 + ROLL $0x05, R14 + MOVL R10, R15 + ORL R11, R15 + ANDL R9, R15 + MOVL R10, BP + ANDL R11, BP + ORL R15, BP + ADDL BP, R14 + ADDL R13, R14 + ADDL $0x8f1bbcdc, R14 + ADDL CX, R14 + ROLL $0x1e, R10 // Round 56. MOVL 20(SP), CX @@ -1054,19 +1054,19 @@ TEXT ·block(SB), $64-32 XORL 32(SP), CX ROLL $0x01, CX MOVL CX, 32(SP) - MOVL R13, R12 - ROLL $0x05, R12 - MOVL R11, R14 - ORL R9, R14 - ANDL R10, R14 - MOVL R11, R15 - ANDL R9, R15 - ORL R14, R15 - ADDL R15, R12 - ADDL R8, R12 - ADDL $0x8f1bbcdc, R12 - ADDL CX, R12 - ROLL $0x1e, R11 + MOVL R14, R13 + ROLL $0x05, R13 + MOVL R12, R15 + ORL R10, R15 + ANDL R11, R15 + MOVL R12, BP + ANDL R10, BP + ORL R15, BP + ADDL BP, R13 + ADDL R9, R13 + ADDL $0x8f1bbcdc, R13 + ADDL CX, R13 + ROLL $0x1e, R12 // Round 57. MOVL 24(SP), CX @@ -1075,19 +1075,19 @@ TEXT ·block(SB), $64-32 XORL 36(SP), CX ROLL $0x01, CX MOVL CX, 36(SP) - MOVL R12, R8 - ROLL $0x05, R8 - MOVL R13, R14 - ORL R11, R14 - ANDL R9, R14 - MOVL R13, R15 - ANDL R11, R15 - ORL R14, R15 - ADDL R15, R8 - ADDL R10, R8 - ADDL $0x8f1bbcdc, R8 - ADDL CX, R8 - ROLL $0x1e, R13 + MOVL R13, R9 + ROLL $0x05, R9 + MOVL R14, R15 + ORL R12, R15 + ANDL R10, R15 + MOVL R14, BP + ANDL R12, BP + ORL R15, BP + ADDL BP, R9 + ADDL R11, R9 + ADDL $0x8f1bbcdc, R9 + ADDL CX, R9 + ROLL $0x1e, R14 // Round 58. MOVL 28(SP), CX @@ -1096,19 +1096,19 @@ TEXT ·block(SB), $64-32 XORL 40(SP), CX ROLL $0x01, CX MOVL CX, 40(SP) - MOVL R8, R10 - ROLL $0x05, R10 - MOVL R12, R14 - ORL R13, R14 - ANDL R11, R14 - MOVL R12, R15 - ANDL R13, R15 + MOVL R9, R11 + ROLL $0x05, R11 + MOVL R13, R15 ORL R14, R15 - ADDL R15, R10 - ADDL R9, R10 - ADDL $0x8f1bbcdc, R10 - ADDL CX, R10 - ROLL $0x1e, R12 + ANDL R12, R15 + MOVL R13, BP + ANDL R14, BP + ORL R15, BP + ADDL BP, R11 + ADDL R10, R11 + ADDL $0x8f1bbcdc, R11 + ADDL CX, R11 + ROLL $0x1e, R13 // Round 59. MOVL 32(SP), CX @@ -1117,19 +1117,19 @@ TEXT ·block(SB), $64-32 XORL 44(SP), CX ROLL $0x01, CX MOVL CX, 44(SP) - MOVL R10, R9 - ROLL $0x05, R9 - MOVL R8, R14 - ORL R12, R14 - ANDL R13, R14 - MOVL R8, R15 - ANDL R12, R15 - ORL R14, R15 - ADDL R15, R9 - ADDL R11, R9 - ADDL $0x8f1bbcdc, R9 - ADDL CX, R9 - ROLL $0x1e, R8 + MOVL R11, R10 + ROLL $0x05, R10 + MOVL R9, R15 + ORL R13, R15 + ANDL R14, R15 + MOVL R9, BP + ANDL R13, BP + ORL R15, BP + ADDL BP, R10 + ADDL R12, R10 + ADDL $0x8f1bbcdc, R10 + ADDL CX, R10 + ROLL $0x1e, R9 // Round 60. MOVL 36(SP), CX @@ -1138,16 +1138,16 @@ TEXT ·block(SB), $64-32 XORL 48(SP), CX ROLL $0x01, CX MOVL CX, 48(SP) - MOVL R9, R11 - ROLL $0x05, R11 - MOVL R10, R14 - XORL R8, R14 - XORL R12, R14 - ADDL R14, R11 - ADDL R13, R11 - ADDL $0xca62c1d6, R11 - ADDL CX, R11 - ROLL $0x1e, R10 + MOVL R10, R12 + ROLL $0x05, R12 + MOVL R11, R15 + XORL R9, R15 + XORL R13, R15 + ADDL R15, R12 + ADDL R14, R12 + ADDL $0xca62c1d6, R12 + ADDL CX, R12 + ROLL $0x1e, R11 // Round 61. MOVL 40(SP), CX @@ -1156,16 +1156,16 @@ TEXT ·block(SB), $64-32 XORL 52(SP), CX ROLL $0x01, CX MOVL CX, 52(SP) - MOVL R11, R13 - ROLL $0x05, R13 - MOVL R9, R14 - XORL R10, R14 - XORL R8, R14 - ADDL R14, R13 - ADDL R12, R13 - ADDL $0xca62c1d6, R13 - ADDL CX, R13 - ROLL $0x1e, R9 + MOVL R12, R14 + ROLL $0x05, R14 + MOVL R10, R15 + XORL R11, R15 + XORL R9, R15 + ADDL R15, R14 + ADDL R13, R14 + ADDL $0xca62c1d6, R14 + ADDL CX, R14 + ROLL $0x1e, R10 // Round 62. MOVL 44(SP), CX @@ -1174,16 +1174,16 @@ TEXT ·block(SB), $64-32 XORL 56(SP), CX ROLL $0x01, CX MOVL CX, 56(SP) - MOVL R13, R12 - ROLL $0x05, R12 - MOVL R11, R14 - XORL R9, R14 - XORL R10, R14 - ADDL R14, R12 - ADDL R8, R12 - ADDL $0xca62c1d6, R12 - ADDL CX, R12 - ROLL $0x1e, R11 + MOVL R14, R13 + ROLL $0x05, R13 + MOVL R12, R15 + XORL R10, R15 + XORL R11, R15 + ADDL R15, R13 + ADDL R9, R13 + ADDL $0xca62c1d6, R13 + ADDL CX, R13 + ROLL $0x1e, R12 // Round 63. MOVL 48(SP), CX @@ -1192,16 +1192,16 @@ TEXT ·block(SB), $64-32 XORL 60(SP), CX ROLL $0x01, CX MOVL CX, 60(SP) - MOVL R12, R8 - ROLL $0x05, R8 - MOVL R13, R14 - XORL R11, R14 - XORL R9, R14 - ADDL R14, R8 - ADDL R10, R8 - ADDL $0xca62c1d6, R8 - ADDL CX, R8 - ROLL $0x1e, R13 + MOVL R13, R9 + ROLL $0x05, R9 + MOVL R14, R15 + XORL R12, R15 + XORL R10, R15 + ADDL R15, R9 + ADDL R11, R9 + ADDL $0xca62c1d6, R9 + ADDL CX, R9 + ROLL $0x1e, R14 // Round 64. MOVL 52(SP), CX @@ -1210,16 +1210,16 @@ TEXT ·block(SB), $64-32 XORL (SP), CX ROLL $0x01, CX MOVL CX, (SP) - MOVL R8, R10 - ROLL $0x05, R10 - MOVL R12, R14 - XORL R13, R14 - XORL R11, R14 - ADDL R14, R10 - ADDL R9, R10 - ADDL $0xca62c1d6, R10 - ADDL CX, R10 - ROLL $0x1e, R12 + MOVL R9, R11 + ROLL $0x05, R11 + MOVL R13, R15 + XORL R14, R15 + XORL R12, R15 + ADDL R15, R11 + ADDL R10, R11 + ADDL $0xca62c1d6, R11 + ADDL CX, R11 + ROLL $0x1e, R13 // Round 65. MOVL 56(SP), CX @@ -1228,16 +1228,16 @@ TEXT ·block(SB), $64-32 XORL 4(SP), CX ROLL $0x01, CX MOVL CX, 4(SP) - MOVL R10, R9 - ROLL $0x05, R9 - MOVL R8, R14 - XORL R12, R14 - XORL R13, R14 - ADDL R14, R9 - ADDL R11, R9 - ADDL $0xca62c1d6, R9 - ADDL CX, R9 - ROLL $0x1e, R8 + MOVL R11, R10 + ROLL $0x05, R10 + MOVL R9, R15 + XORL R13, R15 + XORL R14, R15 + ADDL R15, R10 + ADDL R12, R10 + ADDL $0xca62c1d6, R10 + ADDL CX, R10 + ROLL $0x1e, R9 // Round 66. MOVL 60(SP), CX @@ -1246,16 +1246,16 @@ TEXT ·block(SB), $64-32 XORL 8(SP), CX ROLL $0x01, CX MOVL CX, 8(SP) - MOVL R9, R11 - ROLL $0x05, R11 - MOVL R10, R14 - XORL R8, R14 - XORL R12, R14 - ADDL R14, R11 - ADDL R13, R11 - ADDL $0xca62c1d6, R11 - ADDL CX, R11 - ROLL $0x1e, R10 + MOVL R10, R12 + ROLL $0x05, R12 + MOVL R11, R15 + XORL R9, R15 + XORL R13, R15 + ADDL R15, R12 + ADDL R14, R12 + ADDL $0xca62c1d6, R12 + ADDL CX, R12 + ROLL $0x1e, R11 // Round 67. MOVL (SP), CX @@ -1264,16 +1264,16 @@ TEXT ·block(SB), $64-32 XORL 12(SP), CX ROLL $0x01, CX MOVL CX, 12(SP) - MOVL R11, R13 - ROLL $0x05, R13 - MOVL R9, R14 - XORL R10, R14 - XORL R8, R14 - ADDL R14, R13 - ADDL R12, R13 - ADDL $0xca62c1d6, R13 - ADDL CX, R13 - ROLL $0x1e, R9 + MOVL R12, R14 + ROLL $0x05, R14 + MOVL R10, R15 + XORL R11, R15 + XORL R9, R15 + ADDL R15, R14 + ADDL R13, R14 + ADDL $0xca62c1d6, R14 + ADDL CX, R14 + ROLL $0x1e, R10 // Round 68. MOVL 4(SP), CX @@ -1282,16 +1282,16 @@ TEXT ·block(SB), $64-32 XORL 16(SP), CX ROLL $0x01, CX MOVL CX, 16(SP) - MOVL R13, R12 - ROLL $0x05, R12 - MOVL R11, R14 - XORL R9, R14 - XORL R10, R14 - ADDL R14, R12 - ADDL R8, R12 - ADDL $0xca62c1d6, R12 - ADDL CX, R12 - ROLL $0x1e, R11 + MOVL R14, R13 + ROLL $0x05, R13 + MOVL R12, R15 + XORL R10, R15 + XORL R11, R15 + ADDL R15, R13 + ADDL R9, R13 + ADDL $0xca62c1d6, R13 + ADDL CX, R13 + ROLL $0x1e, R12 // Round 69. MOVL 8(SP), CX @@ -1300,16 +1300,16 @@ TEXT ·block(SB), $64-32 XORL 20(SP), CX ROLL $0x01, CX MOVL CX, 20(SP) - MOVL R12, R8 - ROLL $0x05, R8 - MOVL R13, R14 - XORL R11, R14 - XORL R9, R14 - ADDL R14, R8 - ADDL R10, R8 - ADDL $0xca62c1d6, R8 - ADDL CX, R8 - ROLL $0x1e, R13 + MOVL R13, R9 + ROLL $0x05, R9 + MOVL R14, R15 + XORL R12, R15 + XORL R10, R15 + ADDL R15, R9 + ADDL R11, R9 + ADDL $0xca62c1d6, R9 + ADDL CX, R9 + ROLL $0x1e, R14 // Round 70. MOVL 12(SP), CX @@ -1318,16 +1318,16 @@ TEXT ·block(SB), $64-32 XORL 24(SP), CX ROLL $0x01, CX MOVL CX, 24(SP) - MOVL R8, R10 - ROLL $0x05, R10 - MOVL R12, R14 - XORL R13, R14 - XORL R11, R14 - ADDL R14, R10 - ADDL R9, R10 - ADDL $0xca62c1d6, R10 - ADDL CX, R10 - ROLL $0x1e, R12 + MOVL R9, R11 + ROLL $0x05, R11 + MOVL R13, R15 + XORL R14, R15 + XORL R12, R15 + ADDL R15, R11 + ADDL R10, R11 + ADDL $0xca62c1d6, R11 + ADDL CX, R11 + ROLL $0x1e, R13 // Round 71. MOVL 16(SP), CX @@ -1336,16 +1336,16 @@ TEXT ·block(SB), $64-32 XORL 28(SP), CX ROLL $0x01, CX MOVL CX, 28(SP) - MOVL R10, R9 - ROLL $0x05, R9 - MOVL R8, R14 - XORL R12, R14 - XORL R13, R14 - ADDL R14, R9 - ADDL R11, R9 - ADDL $0xca62c1d6, R9 - ADDL CX, R9 - ROLL $0x1e, R8 + MOVL R11, R10 + ROLL $0x05, R10 + MOVL R9, R15 + XORL R13, R15 + XORL R14, R15 + ADDL R15, R10 + ADDL R12, R10 + ADDL $0xca62c1d6, R10 + ADDL CX, R10 + ROLL $0x1e, R9 // Round 72. MOVL 20(SP), CX @@ -1354,16 +1354,16 @@ TEXT ·block(SB), $64-32 XORL 32(SP), CX ROLL $0x01, CX MOVL CX, 32(SP) - MOVL R9, R11 - ROLL $0x05, R11 - MOVL R10, R14 - XORL R8, R14 - XORL R12, R14 - ADDL R14, R11 - ADDL R13, R11 - ADDL $0xca62c1d6, R11 - ADDL CX, R11 - ROLL $0x1e, R10 + MOVL R10, R12 + ROLL $0x05, R12 + MOVL R11, R15 + XORL R9, R15 + XORL R13, R15 + ADDL R15, R12 + ADDL R14, R12 + ADDL $0xca62c1d6, R12 + ADDL CX, R12 + ROLL $0x1e, R11 // Round 73. MOVL 24(SP), CX @@ -1372,16 +1372,16 @@ TEXT ·block(SB), $64-32 XORL 36(SP), CX ROLL $0x01, CX MOVL CX, 36(SP) - MOVL R11, R13 - ROLL $0x05, R13 - MOVL R9, R14 - XORL R10, R14 - XORL R8, R14 - ADDL R14, R13 - ADDL R12, R13 - ADDL $0xca62c1d6, R13 - ADDL CX, R13 - ROLL $0x1e, R9 + MOVL R12, R14 + ROLL $0x05, R14 + MOVL R10, R15 + XORL R11, R15 + XORL R9, R15 + ADDL R15, R14 + ADDL R13, R14 + ADDL $0xca62c1d6, R14 + ADDL CX, R14 + ROLL $0x1e, R10 // Round 74. MOVL 28(SP), CX @@ -1390,16 +1390,16 @@ TEXT ·block(SB), $64-32 XORL 40(SP), CX ROLL $0x01, CX MOVL CX, 40(SP) - MOVL R13, R12 - ROLL $0x05, R12 - MOVL R11, R14 - XORL R9, R14 - XORL R10, R14 - ADDL R14, R12 - ADDL R8, R12 - ADDL $0xca62c1d6, R12 - ADDL CX, R12 - ROLL $0x1e, R11 + MOVL R14, R13 + ROLL $0x05, R13 + MOVL R12, R15 + XORL R10, R15 + XORL R11, R15 + ADDL R15, R13 + ADDL R9, R13 + ADDL $0xca62c1d6, R13 + ADDL CX, R13 + ROLL $0x1e, R12 // Round 75. MOVL 32(SP), CX @@ -1408,16 +1408,16 @@ TEXT ·block(SB), $64-32 XORL 44(SP), CX ROLL $0x01, CX MOVL CX, 44(SP) - MOVL R12, R8 - ROLL $0x05, R8 - MOVL R13, R14 - XORL R11, R14 - XORL R9, R14 - ADDL R14, R8 - ADDL R10, R8 - ADDL $0xca62c1d6, R8 - ADDL CX, R8 - ROLL $0x1e, R13 + MOVL R13, R9 + ROLL $0x05, R9 + MOVL R14, R15 + XORL R12, R15 + XORL R10, R15 + ADDL R15, R9 + ADDL R11, R9 + ADDL $0xca62c1d6, R9 + ADDL CX, R9 + ROLL $0x1e, R14 // Round 76. MOVL 36(SP), CX @@ -1426,16 +1426,16 @@ TEXT ·block(SB), $64-32 XORL 48(SP), CX ROLL $0x01, CX MOVL CX, 48(SP) - MOVL R8, R10 - ROLL $0x05, R10 - MOVL R12, R14 - XORL R13, R14 - XORL R11, R14 - ADDL R14, R10 - ADDL R9, R10 - ADDL $0xca62c1d6, R10 - ADDL CX, R10 - ROLL $0x1e, R12 + MOVL R9, R11 + ROLL $0x05, R11 + MOVL R13, R15 + XORL R14, R15 + XORL R12, R15 + ADDL R15, R11 + ADDL R10, R11 + ADDL $0xca62c1d6, R11 + ADDL CX, R11 + ROLL $0x1e, R13 // Round 77. MOVL 40(SP), CX @@ -1444,16 +1444,16 @@ TEXT ·block(SB), $64-32 XORL 52(SP), CX ROLL $0x01, CX MOVL CX, 52(SP) - MOVL R10, R9 - ROLL $0x05, R9 - MOVL R8, R14 - XORL R12, R14 - XORL R13, R14 - ADDL R14, R9 - ADDL R11, R9 - ADDL $0xca62c1d6, R9 - ADDL CX, R9 - ROLL $0x1e, R8 + MOVL R11, R10 + ROLL $0x05, R10 + MOVL R9, R15 + XORL R13, R15 + XORL R14, R15 + ADDL R15, R10 + ADDL R12, R10 + ADDL $0xca62c1d6, R10 + ADDL CX, R10 + ROLL $0x1e, R9 // Round 78. MOVL 44(SP), CX @@ -1462,16 +1462,16 @@ TEXT ·block(SB), $64-32 XORL 56(SP), CX ROLL $0x01, CX MOVL CX, 56(SP) - MOVL R9, R11 - ROLL $0x05, R11 - MOVL R10, R14 - XORL R8, R14 - XORL R12, R14 - ADDL R14, R11 - ADDL R13, R11 - ADDL $0xca62c1d6, R11 - ADDL CX, R11 - ROLL $0x1e, R10 + MOVL R10, R12 + ROLL $0x05, R12 + MOVL R11, R15 + XORL R9, R15 + XORL R13, R15 + ADDL R15, R12 + ADDL R14, R12 + ADDL $0xca62c1d6, R12 + ADDL CX, R12 + ROLL $0x1e, R11 // Round 79. MOVL 48(SP), CX @@ -1480,28 +1480,28 @@ TEXT ·block(SB), $64-32 XORL 60(SP), CX ROLL $0x01, CX MOVL CX, 60(SP) - MOVL R11, R13 - ROLL $0x05, R13 - MOVL R9, R14 - XORL R10, R14 - XORL R8, R14 - ADDL R14, R13 - ADDL R12, R13 - ADDL $0xca62c1d6, R13 - ADDL CX, R13 - ROLL $0x1e, R9 + MOVL R12, R14 + ROLL $0x05, R14 + MOVL R10, R15 + XORL R11, R15 + XORL R9, R15 + ADDL R15, R14 + ADDL R13, R14 + ADDL $0xca62c1d6, R14 + ADDL CX, R14 + ROLL $0x1e, R10 // Final add. - ADDL R13, DX - ADDL R11, BX - ADDL R9, BP + ADDL R14, DX + ADDL R12, BX ADDL R10, SI - ADDL R8, DI + ADDL R11, DI + ADDL R9, R8 // Store results back. MOVL DX, (AX) MOVL BX, 4(AX) - MOVL BP, 8(AX) - MOVL SI, 12(AX) - MOVL DI, 16(AX) + MOVL SI, 8(AX) + MOVL DI, 12(AX) + MOVL R8, 16(AX) RET diff --git a/examples/stadtx/stadtx.s b/examples/stadtx/stadtx.s index 2a52cc4..8a2b892 100644 --- a/examples/stadtx/stadtx.s +++ b/examples/stadtx/stadtx.s @@ -3,68 +3,68 @@ #include "textflag.h" // func Hash(state *State, key []byte) uint64 -TEXT ·Hash(SB), NOSPLIT, $8-40 +TEXT ·Hash(SB), NOSPLIT, $0-40 MOVQ state+0(FP), AX MOVQ key_base+8(FP), CX MOVQ key_len+16(FP), DX MOVQ (AX), BX - MOVQ 8(AX), BP - MOVQ DX, SI - ADDQ $0x00000001, SI - MOVQ $0xb89b0f8e1655514f, DI - IMULQ DI, SI - XORQ SI, BX - MOVQ DX, SI - ADDQ $0x00000002, SI - MOVQ $0x8c6f736011bd5127, DI - IMULQ DI, SI - XORQ SI, BP + MOVQ 8(AX), SI + MOVQ DX, DI + ADDQ $0x00000001, DI + MOVQ $0xb89b0f8e1655514f, R8 + IMULQ R8, DI + XORQ DI, BX + MOVQ DX, DI + ADDQ $0x00000002, DI + MOVQ $0x8c6f736011bd5127, R8 + IMULQ R8, DI + XORQ DI, SI CMPQ DX, $0x00000020 JGE coreLong - MOVQ DX, SI - SHRQ $0x03, SI - CMPQ SI, $0x00000000 + MOVQ DX, DI + SHRQ $0x03, DI + CMPQ DI, $0x00000000 JE shortCore0 - CMPQ SI, $0x00000001 + CMPQ DI, $0x00000001 JE shortCore1 - CMPQ SI, $0x00000002 + CMPQ DI, $0x00000002 JE shortCore2 - CMPQ SI, $0x00000003 + CMPQ DI, $0x00000003 JE shortCore3 shortCore3: MOVQ (CX), AX - MOVQ $0x9c1b8e1e9628323f, SI - IMULQ SI, AX + MOVQ $0x9c1b8e1e9628323f, DI + IMULQ DI, AX ADDQ AX, BX RORQ $0x11, BX - XORQ BP, BX - RORQ $0x35, BP - ADDQ BX, BP + XORQ SI, BX + RORQ $0x35, SI + ADDQ BX, SI ADDQ $0x00000008, CX SUBQ $0x00000008, DX shortCore2: MOVQ (CX), AX - MOVQ $0x9c1b8e1e9628323f, SI - IMULQ SI, AX + MOVQ $0x9c1b8e1e9628323f, DI + IMULQ DI, AX ADDQ AX, BX RORQ $0x11, BX - XORQ BP, BX - RORQ $0x35, BP - ADDQ BX, BP + XORQ SI, BX + RORQ $0x35, SI + ADDQ BX, SI ADDQ $0x00000008, CX SUBQ $0x00000008, DX shortCore1: MOVQ (CX), AX - MOVQ $0x9c1b8e1e9628323f, SI - IMULQ SI, AX + MOVQ $0x9c1b8e1e9628323f, DI + IMULQ DI, AX ADDQ AX, BX RORQ $0x11, BX - XORQ BP, BX - RORQ $0x35, BP - ADDQ BX, BP + XORQ SI, BX + RORQ $0x35, SI + ADDQ BX, SI ADDQ $0x00000008, CX SUBQ $0x00000008, DX @@ -94,7 +94,7 @@ shortTail7: shortTail6: MOVBQZX 5(CX), DX SHLQ $0x30, DX - ADDQ DX, BP + ADDQ DX, SI shortTail5: MOVBQZX 4(CX), DX @@ -103,7 +103,7 @@ shortTail5: shortTail4: MOVLQZX (CX), DX - ADDQ DX, BP + ADDQ DX, SI JMP shortAfter shortTail3: @@ -113,7 +113,7 @@ shortTail3: shortTail2: MOVWQZX (CX), DX - ADDQ DX, BP + ADDQ DX, SI JMP shortAfter shortTail1: @@ -121,129 +121,129 @@ shortTail1: ADDQ DX, BX shortTail0: - RORQ $0x20, BP - XORQ $0x000000ff, BP + RORQ $0x20, SI + XORQ $0x000000ff, SI shortAfter: - XORQ BX, BP + XORQ BX, SI RORQ $0x21, BX - ADDQ BP, BX - ROLQ $0x11, BP - XORQ BX, BP + ADDQ SI, BX + ROLQ $0x11, SI + XORQ BX, SI ROLQ $0x2b, BX - ADDQ BP, BX - ROLQ $0x1f, BP - SUBQ BX, BP + ADDQ SI, BX + ROLQ $0x1f, SI + SUBQ BX, SI ROLQ $0x0d, BX - XORQ BP, BX - SUBQ BX, BP + XORQ SI, BX + SUBQ BX, SI ROLQ $0x29, BX - ADDQ BP, BX - ROLQ $0x25, BP - XORQ BX, BP + ADDQ SI, BX + ROLQ $0x25, SI + XORQ BX, SI RORQ $0x27, BX - ADDQ BP, BX - RORQ $0x0f, BP - ADDQ BX, BP + ADDQ SI, BX + RORQ $0x0f, SI + ADDQ BX, SI ROLQ $0x0f, BX - XORQ BP, BX - RORQ $0x05, BP - XORQ BP, BX + XORQ SI, BX + RORQ $0x05, SI + XORQ SI, BX MOVQ BX, ret+32(FP) RET coreLong: - MOVQ 16(AX), DI + MOVQ 16(AX), R8 MOVQ 24(AX), AX - MOVQ DX, SI - ADDQ $0x00000003, SI - MOVQ $0x8f29bd94edce7b39, R8 - IMULQ R8, SI - XORQ SI, DI - MOVQ DX, SI - ADDQ $0x00000004, SI - MOVQ $0x9c1b8e1e9628323f, R8 - IMULQ R8, SI - XORQ SI, AX + MOVQ DX, DI + ADDQ $0x00000003, DI + MOVQ $0x8f29bd94edce7b39, R9 + IMULQ R9, DI + XORQ DI, R8 + MOVQ DX, DI + ADDQ $0x00000004, DI + MOVQ $0x9c1b8e1e9628323f, R9 + IMULQ R9, DI + XORQ DI, AX block: - MOVQ (CX), SI - MOVQ $0x00000000802910e3, R8 - IMULQ R8, SI - ADDQ SI, BX + MOVQ (CX), DI + MOVQ $0x00000000802910e3, R9 + IMULQ R9, DI + ADDQ DI, BX ROLQ $0x39, BX XORQ AX, BX - MOVQ 8(CX), SI - MOVQ $0x00000000819b13af, R8 - IMULQ R8, SI - ADDQ SI, BP - ROLQ $0x3f, BP - XORQ DI, BP - MOVQ 16(CX), SI - MOVQ $0x0000000091cb27e5, R8 - IMULQ R8, SI - ADDQ SI, DI - RORQ $0x2f, DI - ADDQ BX, DI - MOVQ 24(CX), SI - MOVQ $0x00000000c1a269c1, R8 - IMULQ R8, SI - ADDQ SI, AX + MOVQ 8(CX), DI + MOVQ $0x00000000819b13af, R9 + IMULQ R9, DI + ADDQ DI, SI + ROLQ $0x3f, SI + XORQ R8, SI + MOVQ 16(CX), DI + MOVQ $0x0000000091cb27e5, R9 + IMULQ R9, DI + ADDQ DI, R8 + RORQ $0x2f, R8 + ADDQ BX, R8 + MOVQ 24(CX), DI + MOVQ $0x00000000c1a269c1, R9 + IMULQ R9, DI + ADDQ DI, AX RORQ $0x0b, AX - SUBQ BP, AX + SUBQ SI, AX ADDQ $0x00000020, CX SUBQ $0x00000020, DX CMPQ DX, $0x00000020 JGE block - MOVQ DX, R8 - MOVQ DX, SI - SHRQ $0x03, SI - CMPQ SI, $0x00000000 + MOVQ DX, R9 + MOVQ DX, DI + SHRQ $0x03, DI + CMPQ DI, $0x00000000 JE longCore0 - CMPQ SI, $0x00000001 + CMPQ DI, $0x00000001 JE longCore1 - CMPQ SI, $0x00000002 + CMPQ DI, $0x00000002 JE longCore2 - CMPQ SI, $0x00000003 + CMPQ DI, $0x00000003 JE longCore3 longCore3: - MOVQ (CX), SI - MOVQ $0x00000000802910e3, R9 - IMULQ R9, SI - ADDQ SI, BX + MOVQ (CX), DI + MOVQ $0x00000000802910e3, R10 + IMULQ R10, DI + ADDQ DI, BX ROLQ $0x39, BX XORQ AX, BX ADDQ $0x00000008, CX SUBQ $0x00000008, DX longCore2: - MOVQ (CX), SI - MOVQ $0x00000000819b13af, R9 - IMULQ R9, SI - ADDQ SI, BP - ROLQ $0x3f, BP - XORQ DI, BP + MOVQ (CX), DI + MOVQ $0x00000000819b13af, R10 + IMULQ R10, DI + ADDQ DI, SI + ROLQ $0x3f, SI + XORQ R8, SI ADDQ $0x00000008, CX SUBQ $0x00000008, DX longCore1: - MOVQ (CX), SI - MOVQ $0x0000000091cb27e5, R9 - IMULQ R9, SI - ADDQ SI, DI - RORQ $0x2f, DI - ADDQ BX, DI + MOVQ (CX), DI + MOVQ $0x0000000091cb27e5, R10 + IMULQ R10, DI + ADDQ DI, R8 + RORQ $0x2f, R8 + ADDQ BX, R8 ADDQ $0x00000008, CX SUBQ $0x00000008, DX longCore0: RORQ $0x0b, AX - SUBQ BP, AX - ADDQ $0x00000001, R8 - MOVQ $0x9c1b8e1e9628323f, SI - IMULQ SI, R8 - XORQ R8, BX + SUBQ SI, AX + ADDQ $0x00000001, R9 + MOVQ $0x9c1b8e1e9628323f, DI + IMULQ DI, R9 + XORQ R9, BX CMPQ DX, $0x00000000 JE longTail0 CMPQ DX, $0x00000001 @@ -263,22 +263,22 @@ longCore0: longTail7: MOVBQZX 6(CX), DX - ADDQ DX, BP + ADDQ DX, SI longTail6: MOVWQZX 4(CX), DX - ADDQ DX, DI + ADDQ DX, R8 MOVLQZX (CX), DX ADDQ DX, AX JMP longAfter longTail5: MOVBQZX 4(CX), DX - ADDQ DX, BP + ADDQ DX, SI longTail4: MOVLQZX (CX), DX - ADDQ DX, DI + ADDQ DX, R8 JMP longAfter longTail3: @@ -287,52 +287,52 @@ longTail3: longTail2: MOVWQZX (CX), DX - ADDQ DX, BP + ADDQ DX, SI JMP longAfter longTail1: MOVBQZX (CX), DX - ADDQ DX, DI + ADDQ DX, R8 longTail0: ROLQ $0x20, AX XORQ $0x000000ff, AX longAfter: - SUBQ DI, BP + SUBQ R8, SI RORQ $0x13, BX - SUBQ BX, BP - RORQ $0x35, BP - XORQ BP, AX + SUBQ BX, SI + RORQ $0x35, SI + XORQ SI, AX SUBQ AX, BX ROLQ $0x2b, AX ADDQ AX, BX RORQ $0x03, BX SUBQ BX, AX - RORQ $0x2b, DI - SUBQ AX, DI - ROLQ $0x37, DI - XORQ BX, DI - SUBQ DI, BP + RORQ $0x2b, R8 + SUBQ AX, R8 + ROLQ $0x37, R8 + XORQ BX, R8 + SUBQ R8, SI RORQ $0x07, AX - SUBQ DI, AX - RORQ $0x1f, DI - ADDQ DI, AX - SUBQ BP, DI + SUBQ R8, AX + RORQ $0x1f, R8 + ADDQ R8, AX + SUBQ SI, R8 RORQ $0x27, AX - XORQ AX, DI + XORQ AX, R8 RORQ $0x11, AX - XORQ DI, AX - ADDQ AX, BP - RORQ $0x09, BP - XORQ BP, DI - ROLQ $0x18, DI - XORQ DI, AX + XORQ R8, AX + ADDQ AX, SI + RORQ $0x09, SI + XORQ SI, R8 + ROLQ $0x18, R8 + XORQ R8, AX RORQ $0x3b, AX RORQ $0x01, BX - SUBQ BP, BX - XORQ BP, BX - XORQ AX, DI - XORQ DI, BX + SUBQ SI, BX + XORQ SI, BX + XORQ AX, R8 + XORQ R8, BX MOVQ BX, ret+32(FP) RET diff --git a/pass/reg.go b/pass/reg.go index 5df7f0d..6c19eb8 100644 --- a/pass/reg.go +++ b/pass/reg.go @@ -74,7 +74,7 @@ func Liveness(fn *ir.Function) error { // AllocateRegisters performs register allocation. func AllocateRegisters(fn *ir.Function) error { - // Populate allocators (one per kind). + // Initialize one allocator per kind. as := map[reg.Kind]*Allocator{} for _, i := range fn.Instructions() { for _, r := range i.Registers() { @@ -86,7 +86,28 @@ func AllocateRegisters(fn *ir.Function) error { } as[k] = a } - as[k].Add(r.ID()) + } + } + + // De-prioritize the base pointer register. This can be used as a general + // purpose register, but it's callee-save so needs to be saved/restored if + // it is clobbered. For this reason we prefer to avoid using it unless + // forced to by register pressure. + for k, a := range as { + f := reg.FamilyOfKind(k) + for _, r := range f.Registers() { + if (r.Info() & reg.BasePointer) != 0 { + // Negative priority penalizes this register relative to all + // others (having default zero priority). + a.SetPriority(r.ID(), -1) + } + } + } + + // Populate registers to be allocated. + for _, i := range fn.Instructions() { + for _, r := range i.Registers() { + as[r.Kind()].Add(r.ID()) } } diff --git a/pass/reg_test.go b/pass/reg_test.go index 1bb332d..a575f17 100644 --- a/pass/reg_test.go +++ b/pass/reg_test.go @@ -106,6 +106,51 @@ func ConstructLiveness(t *testing.T, ctx *build.Context) *ir.Function { return BuildFunction(t, ctx, pass.LabelTarget, pass.CFG, pass.Liveness) } +func TestAllocateRegistersBasePointerDeprioritized(t *testing.T) { + // Construct a function that requires n general-purpose registers all live + // at once. Choose n to be the maximal possible number of registers without + // touching the base pointer. + n := 14 + + ctx := build.NewContext() + ctx.Function("sum") + ctx.SignatureExpr("func() uint64") + + x := make([]reg.GPVirtual, n) + for i := 0; i < n; i++ { + x[i] = ctx.GP64() + ctx.MOVQ(operand.U64(i), x[i]) + } + + for i := 1; i < n; i++ { + ctx.ADDQ(x[i], x[0]) + } + + ctx.Store(x[0], ctx.ReturnIndex(0)) + ctx.RET() + + // Build and compile the function up to register allocation. + fn := BuildFunction(t, ctx, pass.LabelTarget, pass.CFG, pass.Liveness, pass.AllocateRegisters, pass.BindRegisters) + + // Verify this function uses n registers, but not the base pointer. + ps := map[reg.Physical]bool{} + for _, i := range fn.Instructions() { + for _, r := range i.OutputRegisters() { + ps[reg.ToPhysical(r)] = true + } + } + + if len(ps) != n { + t.Fatalf("expected function to require %d registers", n) + } + + for p := range ps { + if (p.Info() & reg.BasePointer) != 0 { + t.Fatal("base pointer used") + } + } +} + func TestEnsureBasePointerCalleeSavedFrameless(t *testing.T) { // Construct a function that writes to the base pointer. ctx := build.NewContext() diff --git a/tests/alloc/gp8/gp8.s b/tests/alloc/gp8/gp8.s index b2f25fc..a76e7de 100644 --- a/tests/alloc/gp8/gp8.s +++ b/tests/alloc/gp8/gp8.s @@ -8,17 +8,17 @@ TEXT ·GP8(SB), NOSPLIT, $8-1 MOVB $0x02, CL MOVB $0x03, DL MOVB $0x04, BL - MOVB $0x05, BP - MOVB $0x06, SI - MOVB $0x07, DI - MOVB $0x08, R8 - MOVB $0x09, R9 - MOVB $0x0a, R10 - MOVB $0x0b, R11 - MOVB $0x0c, R12 - MOVB $0x0d, R13 - MOVB $0x0e, R14 - MOVB $0x0f, R15 + MOVB $0x05, SI + MOVB $0x06, DI + MOVB $0x07, R8 + MOVB $0x08, R9 + MOVB $0x09, R10 + MOVB $0x0a, R11 + MOVB $0x0b, R12 + MOVB $0x0c, R13 + MOVB $0x0d, R14 + MOVB $0x0e, R15 + MOVB $0x0f, BP MOVB $0x10, AH MOVB $0x11, CH MOVB $0x12, DH @@ -26,7 +26,6 @@ TEXT ·GP8(SB), NOSPLIT, $8-1 ADDB CL, AL ADDB DL, AL ADDB BL, AL - ADDB BP, AL ADDB SI, AL ADDB DI, AL ADDB R8, AL @@ -37,6 +36,7 @@ TEXT ·GP8(SB), NOSPLIT, $8-1 ADDB R13, AL ADDB R14, AL ADDB R15, AL + ADDB BP, AL ADDB AH, AL ADDB CH, AL ADDB DH, AL diff --git a/tests/alloc/masks/masks.s b/tests/alloc/masks/masks.s index 0565dcc..c4c2a26 100644 --- a/tests/alloc/masks/masks.s +++ b/tests/alloc/masks/masks.s @@ -8,36 +8,35 @@ TEXT ·Masks(SB), NOSPLIT, $8-16 MOVQ $0x0002002a, CX MOVQ $0x0003002a, DX MOVQ $0x0004002a, BX - MOVQ $0x0005002a, BP - MOVQ $0x0006002a, SI - MOVQ $0x0007002a, DI - MOVQ $0x0008002a, R8 - MOVQ $0x0009002a, R9 - MOVQ $0x000a002a, R10 - MOVQ $0x000b002a, R11 - MOVQ $0x000c002a, R12 - MOVQ $0x000d002a, R13 - MOVQ $0x000e002a, R14 - MOVQ $0x000f002a, R15 + MOVQ $0x0005002a, SI + MOVQ $0x0006002a, DI + MOVQ $0x0007002a, R8 + MOVQ $0x0008002a, R9 + MOVQ $0x0009002a, R10 + MOVQ $0x000a002a, R11 + MOVQ $0x000b002a, R12 + MOVQ $0x000c002a, R13 + MOVQ $0x000d002a, R14 + MOVQ $0x000e002a, R15 + MOVQ $0x000f002a, BP MOVW $0x0001, AX MOVW $0x0002, CX MOVW $0x0003, DX MOVW $0x0004, BX - MOVW $0x0005, BP - MOVW $0x0006, SI - MOVW $0x0007, DI - MOVW $0x0008, R8 - MOVW $0x0009, R9 - MOVW $0x000a, R10 - MOVW $0x000b, R11 - MOVW $0x000c, R12 - MOVW $0x000d, R13 - MOVW $0x000e, R14 - MOVW $0x000f, R15 + MOVW $0x0005, SI + MOVW $0x0006, DI + MOVW $0x0007, R8 + MOVW $0x0008, R9 + MOVW $0x0009, R10 + MOVW $0x000a, R11 + MOVW $0x000b, R12 + MOVW $0x000c, R13 + MOVW $0x000d, R14 + MOVW $0x000e, R15 + MOVW $0x000f, BP ADDW CX, AX ADDW DX, AX ADDW BX, AX - ADDW BP, AX ADDW SI, AX ADDW DI, AX ADDW R8, AX @@ -48,12 +47,12 @@ TEXT ·Masks(SB), NOSPLIT, $8-16 ADDW R13, AX ADDW R14, AX ADDW R15, AX + ADDW BP, AX MOVW AX, ret+0(FP) MOVW $0x0000, AX MOVW $0x0000, CX MOVW $0x0000, DX MOVW $0x0000, BX - MOVW $0x0000, BP MOVW $0x0000, SI MOVW $0x0000, DI MOVW $0x0000, R8 @@ -64,10 +63,10 @@ TEXT ·Masks(SB), NOSPLIT, $8-16 MOVW $0x0000, R13 MOVW $0x0000, R14 MOVW $0x0000, R15 + MOVW $0x0000, BP ADDQ CX, AX ADDQ DX, AX ADDQ BX, AX - ADDQ BP, AX ADDQ SI, AX ADDQ DI, AX ADDQ R8, AX @@ -78,6 +77,7 @@ TEXT ·Masks(SB), NOSPLIT, $8-16 ADDQ R13, AX ADDQ R14, AX ADDQ R15, AX + ADDQ BP, AX SHRQ $0x10, AX MOVQ AX, ret1+8(FP) RET diff --git a/tests/alloc/upper32/upper32.s b/tests/alloc/upper32/upper32.s index d1ca59f..11e770c 100644 --- a/tests/alloc/upper32/upper32.s +++ b/tests/alloc/upper32/upper32.s @@ -11,7 +11,6 @@ TEXT ·Upper32(SB), NOSPLIT, $8-8 MOVQ $0x9e77d78aacb8cbcc, CX MOVQ $0x9e77d78aacb8cbcc, DX MOVQ $0x9e77d78aacb8cbcc, BX - MOVQ $0x9e77d78aacb8cbcc, BP MOVQ $0x9e77d78aacb8cbcc, SI MOVQ $0x9e77d78aacb8cbcc, DI MOVQ $0x9e77d78aacb8cbcc, R8 @@ -22,10 +21,10 @@ TEXT ·Upper32(SB), NOSPLIT, $8-8 MOVQ $0x9e77d78aacb8cbcc, R13 MOVQ $0x9e77d78aacb8cbcc, R14 MOVQ $0x9e77d78aacb8cbcc, R15 + MOVQ $0x9e77d78aacb8cbcc, BP MOVQ $0x9e77d78aacb8cbcc, CX MOVQ $0x9e77d78aacb8cbcc, DX MOVQ $0x9e77d78aacb8cbcc, BX - MOVQ $0x9e77d78aacb8cbcc, BP MOVQ $0x9e77d78aacb8cbcc, SI MOVQ $0x9e77d78aacb8cbcc, DI MOVQ $0x9e77d78aacb8cbcc, R8 @@ -36,10 +35,10 @@ TEXT ·Upper32(SB), NOSPLIT, $8-8 MOVQ $0x9e77d78aacb8cbcc, R13 MOVQ $0x9e77d78aacb8cbcc, R14 MOVQ $0x9e77d78aacb8cbcc, R15 + MOVQ $0x9e77d78aacb8cbcc, BP MOVQ $0x9e77d78aacb8cbcc, CX MOVQ $0x9e77d78aacb8cbcc, DX MOVQ $0x9e77d78aacb8cbcc, BX - MOVQ $0x9e77d78aacb8cbcc, BP MOVQ $0x9e77d78aacb8cbcc, SI MOVQ $0x9e77d78aacb8cbcc, DI MOVQ $0x9e77d78aacb8cbcc, R8 @@ -50,26 +49,26 @@ TEXT ·Upper32(SB), NOSPLIT, $8-8 MOVQ $0x9e77d78aacb8cbcc, R13 MOVQ $0x9e77d78aacb8cbcc, R14 MOVQ $0x9e77d78aacb8cbcc, R15 + MOVQ $0x9e77d78aacb8cbcc, BP // Iteration 1. MOVL $0x00000001, CX MOVL $0x00000002, DX MOVL $0x00000003, BX - MOVL $0x00000004, BP - MOVL $0x00000005, SI - MOVL $0x00000006, DI - MOVL $0x00000007, R8 - MOVL $0x00000008, R9 - MOVL $0x00000009, R10 - MOVL $0x0000000a, R11 - MOVL $0x0000000b, R12 - MOVL $0x0000000c, R13 - MOVL $0x0000000d, R14 - MOVL $0x0000000e, R15 + MOVL $0x00000004, SI + MOVL $0x00000005, DI + MOVL $0x00000006, R8 + MOVL $0x00000007, R9 + MOVL $0x00000008, R10 + MOVL $0x00000009, R11 + MOVL $0x0000000a, R12 + MOVL $0x0000000b, R13 + MOVL $0x0000000c, R14 + MOVL $0x0000000d, R15 + MOVL $0x0000000e, BP ADDQ CX, AX ADDQ DX, AX ADDQ BX, AX - ADDQ BP, AX ADDQ SI, AX ADDQ DI, AX ADDQ R8, AX @@ -80,26 +79,26 @@ TEXT ·Upper32(SB), NOSPLIT, $8-8 ADDQ R13, AX ADDQ R14, AX ADDQ R15, AX + ADDQ BP, AX // Iteration 2. MOVL $0x0000000f, CX MOVL $0x00000010, DX MOVL $0x00000011, BX - MOVL $0x00000012, BP - MOVL $0x00000013, SI - MOVL $0x00000014, DI - MOVL $0x00000015, R8 - MOVL $0x00000016, R9 - MOVL $0x00000017, R10 - MOVL $0x00000018, R11 - MOVL $0x00000019, R12 - MOVL $0x0000001a, R13 - MOVL $0x0000001b, R14 - MOVL $0x0000001c, R15 + MOVL $0x00000012, SI + MOVL $0x00000013, DI + MOVL $0x00000014, R8 + MOVL $0x00000015, R9 + MOVL $0x00000016, R10 + MOVL $0x00000017, R11 + MOVL $0x00000018, R12 + MOVL $0x00000019, R13 + MOVL $0x0000001a, R14 + MOVL $0x0000001b, R15 + MOVL $0x0000001c, BP ADDQ CX, AX ADDQ DX, AX ADDQ BX, AX - ADDQ BP, AX ADDQ SI, AX ADDQ DI, AX ADDQ R8, AX @@ -110,26 +109,26 @@ TEXT ·Upper32(SB), NOSPLIT, $8-8 ADDQ R13, AX ADDQ R14, AX ADDQ R15, AX + ADDQ BP, AX // Iteration 3. MOVL $0x0000001d, CX MOVL $0x0000001e, DX MOVL $0x0000001f, BX - MOVL $0x00000020, BP - MOVL $0x00000021, SI - MOVL $0x00000022, DI - MOVL $0x00000023, R8 - MOVL $0x00000024, R9 - MOVL $0x00000025, R10 - MOVL $0x00000026, R11 - MOVL $0x00000027, R12 - MOVL $0x00000028, R13 - MOVL $0x00000029, R14 - MOVL $0x0000002a, R15 + MOVL $0x00000020, SI + MOVL $0x00000021, DI + MOVL $0x00000022, R8 + MOVL $0x00000023, R9 + MOVL $0x00000024, R10 + MOVL $0x00000025, R11 + MOVL $0x00000026, R12 + MOVL $0x00000027, R13 + MOVL $0x00000028, R14 + MOVL $0x00000029, R15 + MOVL $0x0000002a, BP ADDQ CX, AX ADDQ DX, AX ADDQ BX, AX - ADDQ BP, AX ADDQ SI, AX ADDQ DI, AX ADDQ R8, AX @@ -140,6 +139,7 @@ TEXT ·Upper32(SB), NOSPLIT, $8-8 ADDQ R13, AX ADDQ R14, AX ADDQ R15, AX + ADDQ BP, AX // Store result and return. MOVQ AX, ret+0(FP) diff --git a/tests/fixedbugs/issue100/allocfail/allocfail.s b/tests/fixedbugs/issue100/allocfail/allocfail.s index e168c9e..68f9352 100644 --- a/tests/fixedbugs/issue100/allocfail/allocfail.s +++ b/tests/fixedbugs/issue100/allocfail/allocfail.s @@ -41,224 +41,224 @@ zero_loop_encodeBlockAsm: MOVQ src_base+24(FP), CX search_loop_encodeBlockAsm: - MOVQ (CX)(AX*1), BP + MOVQ (CX)(AX*1), SI MOVL AX, BX SUBL 20(SP), BX SHRL $0x06, BX LEAQ 4(AX)(BX*1), BX - MOVL 16(SP), SI - CMPL BX, SI + MOVL 16(SP), DI + CMPL BX, DI JGT emit_remainder_encodeBlockAsm MOVL BX, 28(SP) MOVQ $0x0000cf1bbcdcbf9b, BX - MOVQ BP, DI - MOVQ BP, R8 - SHRQ $0x08, R8 - SHLQ $0x10, DI - IMULQ BX, DI - SHRQ $0x30, DI + MOVQ SI, R8 + MOVQ SI, R9 + SHRQ $0x08, R9 SHLQ $0x10, R8 IMULQ BX, R8 SHRQ $0x30, R8 - MOVL 32(SP)(DI*1), BX - MOVL 32(SP)(R8*1), SI - MOVL AX, 32(SP)(DI*1) - LEAL 1(AX), DI - MOVL DI, 32(SP)(R8*1) - MOVL AX, DI - SUBL 24(SP), DI - MOVL 1(CX)(DI*1), R9 - MOVQ BP, R8 - SHLQ $0x08, R8 - CMPL R8, R9 + SHLQ $0x10, R9 + IMULQ BX, R9 + SHRQ $0x30, R9 + MOVL 32(SP)(R8*1), BX + MOVL 32(SP)(R9*1), DI + MOVL AX, 32(SP)(R8*1) + LEAL 1(AX), R8 + MOVL R8, 32(SP)(R9*1) + MOVL AX, R8 + SUBL 24(SP), R8 + MOVL 1(CX)(R8*1), R10 + MOVQ SI, R9 + SHLQ $0x08, R9 + CMPL R9, R10 JNE no_repeat_found_encodeBlockAsm - LEAQ 1(AX), BP + LEAQ 1(AX), SI MOVL 20(SP), BX - TESTL DI, DI + TESTL R8, R8 JZ repeat_extend_back_end_encodeBlockAsm repeat_extend_back_loop_encodeBlockAsm: - CMPL BP, BX + CMPL SI, BX JG repeat_extend_back_end_encodeBlockAsm - MOVB -1(CX)(DI*1), DL - MOVB -1(CX)(BP*1), SI - CMPB DL, SI + MOVB -1(CX)(R8*1), DL + MOVB -1(CX)(SI*1), DI + CMPB DL, DI JNE repeat_extend_back_end_encodeBlockAsm - LEAQ -1(BP), BP - DECL DI + LEAQ -1(SI), SI + DECL R8 JZ repeat_extend_back_end_encodeBlockAsm JMP repeat_extend_back_loop_encodeBlockAsm repeat_extend_back_end_encodeBlockAsm: MOVL 20(SP), BX - CMPL BX, BP + CMPL BX, SI JEQ emit_literal_skip_repeat_emit_encodeBlockAsm - MOVL BP, SI - MOVL BP, 20(SP) - LEAQ (CX)(BX*1), DI - SUBL BX, SI + MOVL SI, DI + MOVL SI, 20(SP) + LEAQ (CX)(BX*1), R8 + SUBL BX, DI MOVQ dst_base+0(FP), BX - MOVQ SI, R8 - SUBL $0x01, R8 + MOVQ DI, R9 + SUBL $0x01, R9 JC emit_literal_done_repeat_emit_encodeBlockAsm - CMPL R8, $0x3c + CMPL R9, $0x3c JLT one_byte_repeat_emit_encodeBlockAsm - CMPL R8, $0x00000100 + CMPL R9, $0x00000100 JLT two_bytes_repeat_emit_encodeBlockAsm - CMPL R8, $0x00010000 + CMPL R9, $0x00010000 JLT three_bytes_repeat_emit_encodeBlockAsm - CMPL R8, $0x01000000 + CMPL R9, $0x01000000 JLT four_bytes_repeat_emit_encodeBlockAsm MOVB $0xfc, (BX) - MOVL R8, 1(BX) + MOVL R9, 1(BX) ADDQ $0x05, BX JMP memmove_repeat_emit_encodeBlockAsm four_bytes_repeat_emit_encodeBlockAsm: - MOVQ R8, R9 - SHRL $0x10, R9 + MOVQ R9, R10 + SHRL $0x10, R10 MOVB $0xf8, (BX) - MOVW R8, 1(BX) - MOVB R9, 3(BX) + MOVW R9, 1(BX) + MOVB R10, 3(BX) ADDQ $0x04, BX JMP memmove_repeat_emit_encodeBlockAsm three_bytes_repeat_emit_encodeBlockAsm: MOVB $0xf4, (BX) - MOVW R8, 1(BX) + MOVW R9, 1(BX) ADDQ $0x03, BX JMP memmove_repeat_emit_encodeBlockAsm two_bytes_repeat_emit_encodeBlockAsm: MOVB $0xf0, (BX) - MOVB R8, 1(BX) + MOVB R9, 1(BX) ADDQ $0x02, BX JMP memmove_repeat_emit_encodeBlockAsm one_byte_repeat_emit_encodeBlockAsm: - SHLB $0x02, R8 - MOVB R8, (BX) + SHLB $0x02, R9 + MOVB R9, (BX) ADDQ $0x01, BX memmove_repeat_emit_encodeBlockAsm: - LEAQ (BX)(SI*1), R8 + LEAQ (BX)(DI*1), R9 NOP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_tail: - TESTQ SI, SI + TESTQ DI, DI JEQ emit_literal_done_repeat_emit_encodeBlockAsm - CMPQ SI, $0x02 + CMPQ DI, $0x02 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_1or2 - CMPQ SI, $0x04 + CMPQ DI, $0x04 JB emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_3 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_4 - CMPQ SI, $0x08 + CMPQ DI, $0x08 JB emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_5through7 JE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8 - CMPQ SI, $0x10 + CMPQ DI, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_9through16 - CMPQ SI, $0x20 + CMPQ DI, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32 - CMPQ SI, $0x40 + CMPQ DI, $0x40 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64 - CMPQ SI, $0x80 + CMPQ DI, $0x80 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_65through128 - CMPQ SI, $0x00000100 + CMPQ DI, $0x00000100 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_129through256 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_256through2048 emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_1or2: - MOVB (DI), R8 - MOVB -1(DI)(SI*1), DI - MOVB R8, (BX) - MOVB DI, -1(BX)(SI*1) + MOVB (R8), R9 + MOVB -1(R8)(DI*1), R8 + MOVB R9, (BX) + MOVB R8, -1(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_4: - MOVL (DI), R8 - MOVL R8, (BX) + MOVL (R8), R9 + MOVL R9, (BX) JMP emit_literal_done_repeat_emit_encodeBlockAsm emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_3: - MOVW (DI), R8 - MOVB 2(DI), DI - MOVW R8, (BX) - MOVB DI, 2(BX) + MOVW (R8), R9 + MOVB 2(R8), R8 + MOVW R9, (BX) + MOVB R8, 2(BX) JMP emit_literal_done_repeat_emit_encodeBlockAsm emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_5through7: - MOVL (DI), R8 - MOVL -4(DI)(SI*1), DI - MOVL R8, (BX) - MOVL DI, -4(BX)(SI*1) + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R8 + MOVL R9, (BX) + MOVL R8, -4(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8: - MOVQ (DI), R8 - MOVQ R8, (BX) + MOVQ (R8), R9 + MOVQ R9, (BX) JMP emit_literal_done_repeat_emit_encodeBlockAsm emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_9through16: - MOVQ (DI), R8 - MOVQ -8(DI)(SI*1), DI - MOVQ R8, (BX) - MOVQ DI, -8(BX)(SI*1) + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (BX) + MOVQ R8, -8(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(SI*1), X1 + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 MOVOU X0, (BX) - MOVOU X1, -16(BX)(SI*1) + MOVOU X1, -16(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(SI*1), X2 - MOVOU -16(DI)(SI*1), X3 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 MOVOU X0, (BX) MOVOU X1, 16(BX) - MOVOU X2, -32(BX)(SI*1) - MOVOU X3, -16(BX)(SI*1) + MOVOU X2, -32(BX)(DI*1) + MOVOU X3, -16(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_65through128: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU 32(DI), X2 - MOVOU 48(DI), X3 - MOVOU -64(DI)(SI*1), X12 - MOVOU -48(DI)(SI*1), X13 - MOVOU -32(DI)(SI*1), X14 - MOVOU -16(DI)(SI*1), X15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (BX) MOVOU X1, 16(BX) MOVOU X2, 32(BX) MOVOU X3, 48(BX) - MOVOU X12, -64(BX)(SI*1) - MOVOU X13, -48(BX)(SI*1) - MOVOU X14, -32(BX)(SI*1) - MOVOU X15, -16(BX)(SI*1) + MOVOU X12, -64(BX)(DI*1) + MOVOU X13, -48(BX)(DI*1) + MOVOU X14, -32(BX)(DI*1) + MOVOU X15, -16(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_129through256: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU 32(DI), X2 - MOVOU 48(DI), X3 - MOVOU 64(DI), X4 - MOVOU 80(DI), X5 - MOVOU 96(DI), X6 - MOVOU 112(DI), X7 - MOVOU -128(DI)(SI*1), X8 - MOVOU -112(DI)(SI*1), X9 - MOVOU -96(DI)(SI*1), X10 - MOVOU -80(DI)(SI*1), X11 - MOVOU -64(DI)(SI*1), X12 - MOVOU -48(DI)(SI*1), X13 - MOVOU -32(DI)(SI*1), X14 - MOVOU -16(DI)(SI*1), X15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU -128(R8)(DI*1), X8 + MOVOU -112(R8)(DI*1), X9 + MOVOU -96(R8)(DI*1), X10 + MOVOU -80(R8)(DI*1), X11 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (BX) MOVOU X1, 16(BX) MOVOU X2, 32(BX) @@ -267,34 +267,34 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_129through256: MOVOU X5, 80(BX) MOVOU X6, 96(BX) MOVOU X7, 112(BX) - MOVOU X8, -128(BX)(SI*1) - MOVOU X9, -112(BX)(SI*1) - MOVOU X10, -96(BX)(SI*1) - MOVOU X11, -80(BX)(SI*1) - MOVOU X12, -64(BX)(SI*1) - MOVOU X13, -48(BX)(SI*1) - MOVOU X14, -32(BX)(SI*1) - MOVOU X15, -16(BX)(SI*1) + MOVOU X8, -128(BX)(DI*1) + MOVOU X9, -112(BX)(DI*1) + MOVOU X10, -96(BX)(DI*1) + MOVOU X11, -80(BX)(DI*1) + MOVOU X12, -64(BX)(DI*1) + MOVOU X13, -48(BX)(DI*1) + MOVOU X14, -32(BX)(DI*1) + MOVOU X15, -16(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_256through2048: - LEAQ -256(SI), SI - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU 32(DI), X2 - MOVOU 48(DI), X3 - MOVOU 64(DI), X4 - MOVOU 80(DI), X5 - MOVOU 96(DI), X6 - MOVOU 112(DI), X7 - MOVOU 128(DI), X8 - MOVOU 144(DI), X9 - MOVOU 160(DI), X10 - MOVOU 176(DI), X11 - MOVOU 192(DI), X12 - MOVOU 208(DI), X13 - MOVOU 224(DI), X14 - MOVOU 240(DI), X15 + LEAQ -256(DI), DI + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU 128(R8), X8 + MOVOU 144(R8), X9 + MOVOU 160(R8), X10 + MOVOU 176(R8), X11 + MOVOU 192(R8), X12 + MOVOU 208(R8), X13 + MOVOU 224(R8), X14 + MOVOU 240(R8), X15 MOVOU X0, (BX) MOVOU X1, 16(BX) MOVOU X2, 32(BX) @@ -311,12 +311,12 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_256through2048: MOVOU X13, 208(BX) MOVOU X14, 224(BX) MOVOU X15, 240(BX) - CMPQ SI, $0x00000100 - LEAQ 256(DI), DI + CMPQ DI, $0x00000100 + LEAQ 256(R8), R8 LEAQ 256(BX), BX JGE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_256through2048 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_tail - MOVQ R8, BX + MOVQ R9, BX emit_literal_done_repeat_emit_encodeBlockAsm: MOVQ BX, dst_base+0(FP) @@ -327,23 +327,23 @@ emit_literal_skip_repeat_emit_encodeBlockAsm: SUBL 24(SP), BX MOVL 16(SP), BX SUBL AX, BX - XORQ DI, DI + XORQ R8, R8 CMPQ BX, $0x08 JL matchlen_single_repeat_extend matchlen_loopback_repeat_extend: - MOVQ (CX)(DI*1), SI - XORQ (CX)(DI*1), SI - TESTQ SI, SI + MOVQ (CX)(R8*1), DI + XORQ (CX)(R8*1), DI + TESTQ DI, DI JZ matchlen_loop_repeat_extend - BSFQ SI, SI - SARQ $0x03, SI - LEAQ (DI)(SI*1), DI + BSFQ DI, DI + SARQ $0x03, DI + LEAQ (R8)(DI*1), R8 JMP repeat_extend_forward_end_encodeBlockAsm matchlen_loop_repeat_extend: LEAQ -8(BX), BX - LEAQ 8(DI), DI + LEAQ 8(R8), R8 CMPQ BX, $0x08 JGE matchlen_loopback_repeat_extend @@ -352,31 +352,31 @@ matchlen_single_repeat_extend: JZ repeat_extend_forward_end_encodeBlockAsm matchlen_single_loopback_repeat_extend: - MOVB (CX)(DI*1), SI - CMPB (CX)(DI*1), SI + MOVB (CX)(R8*1), DI + CMPB (CX)(R8*1), DI JNE repeat_extend_forward_end_encodeBlockAsm - LEAQ 1(DI), DI + LEAQ 1(R8), R8 DECQ BX JNZ matchlen_single_loopback_repeat_extend repeat_extend_forward_end_encodeBlockAsm: - ADDL DI, AX + ADDL R8, AX MOVL AX, BX - SUBL BP, BX - MOVL 24(SP), BP - MOVQ dst_base+0(FP), SI - MOVL 20(SP), DI - TESTL DI, DI + SUBL SI, BX + MOVL 24(SP), SI + MOVQ dst_base+0(FP), DI + MOVL 20(SP), R8 + TESTL R8, R8 JZ repeat_as_copy_encodeBlockAsm emit_repeat_again_match_repeat_: - MOVQ BX, DI + MOVQ BX, R8 LEAQ -4(BX), BX - CMPL DI, $0x08 + CMPL R8, $0x08 JLE repeat_two_match_repeat_ - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_match_repeat_ - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_repeat_ cant_repeat_two_offset_match_repeat_: @@ -387,74 +387,74 @@ cant_repeat_two_offset_match_repeat_: CMPL BX, $0x0100ffff JLT repeat_five_match_repeat_ LEAQ -16842747(BX), BX - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_match_repeat_ repeat_five_match_repeat_: LEAQ -65536(BX), BX - MOVQ BX, BP - MOVW $0x001d, (SI) - MOVW BX, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + MOVQ BX, SI + MOVW $0x001d, (DI) + MOVW BX, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP repeat_end_emit_encodeBlockAsm repeat_four_match_repeat_: LEAQ -256(BX), BX - MOVW $0x0019, (SI) - MOVW BX, 2(SI) - ADDQ $0x04, SI + MOVW $0x0019, (DI) + MOVW BX, 2(DI) + ADDQ $0x04, DI JMP repeat_end_emit_encodeBlockAsm repeat_three_match_repeat_: LEAQ -4(BX), BX - MOVW $0x0015, (SI) - MOVB BL, 2(SI) - ADDQ $0x03, SI + MOVW $0x0015, (DI) + MOVB BL, 2(DI) + ADDQ $0x03, DI JMP repeat_end_emit_encodeBlockAsm repeat_two_match_repeat_: SHLL $0x02, BX ORL $0x01, BX - MOVW BX, (SI) - ADDQ $0x02, SI + MOVW BX, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm repeat_two_offset_match_repeat_: - XORQ DI, DI - LEAQ 1(DI)(BX*4), BX - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (SI) - ADDQ $0x02, SI + XORQ R8, R8 + LEAQ 1(R8)(BX*4), BX + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm repeat_as_copy_encodeBlockAsm: - CMPL BP, $0x00010000 + CMPL SI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeBlockAsm CMPL BX, $0x40 JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm - MOVB $0xff, (SI) - MOVD BP, 1(SI) + MOVB $0xff, (DI) + MOVD SI, 1(DI) LEAQ -64(BX), BX - ADDQ $0x05, SI + ADDQ $0x05, DI CMPL BX, $0x04 JL four_bytes_remain_repeat_as_copy_encodeBlockAsm emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy: - MOVQ BX, DI + MOVQ BX, R8 LEAQ -4(BX), BX - CMPL DI, $0x08 + CMPL R8, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: @@ -465,52 +465,52 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: CMPL BX, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy LEAQ -16842747(BX), BX - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy: LEAQ -65536(BX), BX - MOVQ BX, BP - MOVW $0x001d, (SI) - MOVW BX, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + MOVQ BX, SI + MOVW $0x001d, (DI) + MOVW BX, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP repeat_end_emit_encodeBlockAsm repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy: LEAQ -256(BX), BX - MOVW $0x0019, (SI) - MOVW BX, 2(SI) - ADDQ $0x04, SI + MOVW $0x0019, (DI) + MOVW BX, 2(DI) + ADDQ $0x04, DI JMP repeat_end_emit_encodeBlockAsm repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy: LEAQ -4(BX), BX - MOVW $0x0015, (SI) - MOVB BL, 2(SI) - ADDQ $0x03, SI + MOVW $0x0015, (DI) + MOVB BL, 2(DI) + ADDQ $0x03, DI JMP repeat_end_emit_encodeBlockAsm repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy: SHLL $0x02, BX ORL $0x01, BX - MOVW BX, (SI) - ADDQ $0x02, SI + MOVW BX, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: - XORQ DI, DI - LEAQ 1(DI)(BX*4), BX - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (SI) - ADDQ $0x02, SI + XORQ R8, R8 + LEAQ 1(R8)(BX*4), BX + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm four_bytes_remain_repeat_as_copy_encodeBlockAsm: @@ -518,27 +518,27 @@ four_bytes_remain_repeat_as_copy_encodeBlockAsm: JZ repeat_end_emit_encodeBlockAsm MOVB $0x03, DL LEAQ -4(DX)(BX*4), BX - MOVB BL, (SI) - MOVD BP, 1(SI) - ADDQ $0x05, SI + MOVB BL, (DI) + MOVD SI, 1(DI) + ADDQ $0x05, DI JMP repeat_end_emit_encodeBlockAsm two_byte_offset_repeat_as_copy_encodeBlockAsm: CMPL BX, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm - MOVB $0xee, (SI) - MOVW BP, 1(SI) + MOVB $0xee, (DI) + MOVW SI, 1(DI) LEAQ -60(BX), BX - ADDQ $0x03, SI + ADDQ $0x03, DI emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short: - MOVQ BX, DI + MOVQ BX, R8 LEAQ -4(BX), BX - CMPL DI, $0x08 + CMPL R8, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: @@ -549,100 +549,100 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: CMPL BX, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short LEAQ -16842747(BX), BX - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short: LEAQ -65536(BX), BX - MOVQ BX, BP - MOVW $0x001d, (SI) - MOVW BX, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + MOVQ BX, SI + MOVW $0x001d, (DI) + MOVW BX, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP repeat_end_emit_encodeBlockAsm repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short: LEAQ -256(BX), BX - MOVW $0x0019, (SI) - MOVW BX, 2(SI) - ADDQ $0x04, SI + MOVW $0x0019, (DI) + MOVW BX, 2(DI) + ADDQ $0x04, DI JMP repeat_end_emit_encodeBlockAsm repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short: LEAQ -4(BX), BX - MOVW $0x0015, (SI) - MOVB BL, 2(SI) - ADDQ $0x03, SI + MOVW $0x0015, (DI) + MOVB BL, 2(DI) + ADDQ $0x03, DI JMP repeat_end_emit_encodeBlockAsm repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short: SHLL $0x02, BX ORL $0x01, BX - MOVW BX, (SI) - ADDQ $0x02, SI + MOVW BX, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: - XORQ DI, DI - LEAQ 1(DI)(BX*4), BX - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (SI) - ADDQ $0x02, SI + XORQ R8, R8 + LEAQ 1(R8)(BX*4), BX + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm two_byte_offset_short_repeat_as_copy_encodeBlockAsm: CMPL BX, $0x0c JGE emit_copy_three_repeat_as_copy_encodeBlockAsm - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeBlockAsm MOVB $0x01, DL LEAQ -16(DX)(BX*4), BX - MOVB BP, 1(SI) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (SI) - ADDQ $0x02, SI + MOVB SI, 1(DI) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm emit_copy_three_repeat_as_copy_encodeBlockAsm: MOVB $0x02, DL LEAQ -4(DX)(BX*4), BX - MOVB BL, (SI) - MOVW BP, 1(SI) - ADDQ $0x03, SI + MOVB BL, (DI) + MOVW SI, 1(DI) + ADDQ $0x03, DI repeat_end_emit_encodeBlockAsm: - MOVQ SI, dst_base+0(FP) + MOVQ DI, dst_base+0(FP) MOVL 16(SP), BX CMPL AX, BX JGT emit_remainder_encodeBlockAsm JMP search_loop_encodeBlockAsm no_repeat_found_encodeBlockAsm: - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ BP, DI - SHRQ $0x10, DI - SHLQ $0x10, DI - IMULQ R8, DI - SHRQ $0x30, DI - CMPL (CX)(BX*1), BP - SHRQ $0x08, BP + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ SI, R8 + SHRQ $0x10, R8 + SHLQ $0x10, R8 + IMULQ R9, R8 + SHRQ $0x30, R8 + CMPL (CX)(BX*1), SI + SHRQ $0x08, SI JEQ candidate_match_encodeBlockAsm - MOVL 32(SP)(DI*1), BX - CMPL (CX)(SI*1), BP + MOVL 32(SP)(R8*1), BX + CMPL (CX)(DI*1), SI JEQ candidate2_match_encodeBlockAsm - LEAQ 2(AX), SI - MOVL SI, 32(SP)(DI*1) - SHRQ $0x08, BP - CMPL (CX)(BX*1), BP + LEAQ 2(AX), DI + MOVL DI, 32(SP)(R8*1) + SHRQ $0x08, SI + CMPL (CX)(BX*1), SI JEQ candidate3_match_encodeBlockAsm MOVL 28(SP), AX JMP search_loop_encodeBlockAsm @@ -653,21 +653,21 @@ candidate3_match_encodeBlockAsm: candidate2_match_encodeBlockAsm: LEAQ -2(AX), BX - MOVL BX, 32(SP)(DI*1) + MOVL BX, 32(SP)(R8*1) INCL AX - MOVL SI, BX + MOVL DI, BX candidate_match_encodeBlockAsm: - MOVL 20(SP), BP + MOVL 20(SP), SI TESTL BX, BX JZ match_extend_back_end_encodeBlockAsm match_extend_back_loop_encodeBlockAsm: - CMPL AX, BP + CMPL AX, SI JG match_extend_back_end_encodeBlockAsm MOVB -1(CX)(BX*1), DL - MOVB -1(CX)(AX*1), SI - CMPB DL, SI + MOVB -1(CX)(AX*1), DI + CMPB DL, DI JNE match_extend_back_end_encodeBlockAsm LEAL -1(AX), AX DECL BX @@ -675,507 +675,507 @@ match_extend_back_loop_encodeBlockAsm: JMP match_extend_back_loop_encodeBlockAsm match_extend_back_end_encodeBlockAsm: - MOVL AX, BP - SUBL 20(SP), BP - LEAQ dst_base+0(FP)(BP*1), BP - CMPQ BP, (SP) + MOVL AX, SI + SUBL 20(SP), SI + LEAQ dst_base+0(FP)(SI*1), SI + CMPQ SI, (SP) JL match_dst_size_check_encodeBlockAsm MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBlockAsm: - MOVL BX, BP - MOVL 20(SP), SI - CMPL SI, BP + MOVL BX, SI + MOVL 20(SP), DI + CMPL DI, SI JEQ emit_literal_skip_match_emit_encodeBlockAsm - MOVL BP, DI - MOVL BP, 20(SP) - LEAQ (CX)(SI*1), BP - SUBL SI, DI - MOVQ dst_base+0(FP), SI - MOVQ DI, R8 - SUBL $0x01, R8 + MOVL SI, R8 + MOVL SI, 20(SP) + LEAQ (CX)(DI*1), SI + SUBL DI, R8 + MOVQ dst_base+0(FP), DI + MOVQ R8, R9 + SUBL $0x01, R9 JC emit_literal_done_match_emit_encodeBlockAsm - CMPL R8, $0x3c + CMPL R9, $0x3c JLT one_byte_match_emit_encodeBlockAsm - CMPL R8, $0x00000100 + CMPL R9, $0x00000100 JLT two_bytes_match_emit_encodeBlockAsm - CMPL R8, $0x00010000 + CMPL R9, $0x00010000 JLT three_bytes_match_emit_encodeBlockAsm - CMPL R8, $0x01000000 + CMPL R9, $0x01000000 JLT four_bytes_match_emit_encodeBlockAsm - MOVB $0xfc, (SI) - MOVL R8, 1(SI) - ADDQ $0x05, SI + MOVB $0xfc, (DI) + MOVL R9, 1(DI) + ADDQ $0x05, DI JMP memmove_match_emit_encodeBlockAsm four_bytes_match_emit_encodeBlockAsm: - MOVQ R8, R9 - SHRL $0x10, R9 - MOVB $0xf8, (SI) - MOVW R8, 1(SI) - MOVB R9, 3(SI) - ADDQ $0x04, SI + MOVQ R9, R10 + SHRL $0x10, R10 + MOVB $0xf8, (DI) + MOVW R9, 1(DI) + MOVB R10, 3(DI) + ADDQ $0x04, DI JMP memmove_match_emit_encodeBlockAsm three_bytes_match_emit_encodeBlockAsm: - MOVB $0xf4, (SI) - MOVW R8, 1(SI) - ADDQ $0x03, SI + MOVB $0xf4, (DI) + MOVW R9, 1(DI) + ADDQ $0x03, DI JMP memmove_match_emit_encodeBlockAsm two_bytes_match_emit_encodeBlockAsm: - MOVB $0xf0, (SI) - MOVB R8, 1(SI) - ADDQ $0x02, SI + MOVB $0xf0, (DI) + MOVB R9, 1(DI) + ADDQ $0x02, DI JMP memmove_match_emit_encodeBlockAsm one_byte_match_emit_encodeBlockAsm: - SHLB $0x02, R8 - MOVB R8, (SI) - ADDQ $0x01, SI + SHLB $0x02, R9 + MOVB R9, (DI) + ADDQ $0x01, DI memmove_match_emit_encodeBlockAsm: - LEAQ (SI)(DI*1), R8 + LEAQ (DI)(R8*1), R9 NOP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_tail: - TESTQ DI, DI + TESTQ R8, R8 JEQ emit_literal_done_match_emit_encodeBlockAsm - CMPQ DI, $0x02 + CMPQ R8, $0x02 JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_1or2 - CMPQ DI, $0x04 + CMPQ R8, $0x04 JB emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_3 JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_4 - CMPQ DI, $0x08 + CMPQ R8, $0x08 JB emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_5through7 JE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8 - CMPQ DI, $0x10 + CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_9through16 - CMPQ DI, $0x20 + CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32 - CMPQ DI, $0x40 + CMPQ R8, $0x40 JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64 - CMPQ DI, $0x80 + CMPQ R8, $0x80 JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_65through128 - CMPQ DI, $0x00000100 + CMPQ R8, $0x00000100 JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_129through256 JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_256through2048 emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_1or2: - MOVB (BP), R8 - MOVB -1(BP)(DI*1), BP - MOVB R8, (SI) - MOVB BP, -1(SI)(DI*1) + MOVB (SI), R9 + MOVB -1(SI)(R8*1), SI + MOVB R9, (DI) + MOVB SI, -1(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_4: - MOVL (BP), R8 - MOVL R8, (SI) + MOVL (SI), R9 + MOVL R9, (DI) JMP emit_literal_done_match_emit_encodeBlockAsm emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_3: - MOVW (BP), R8 - MOVB 2(BP), BP - MOVW R8, (SI) - MOVB BP, 2(SI) + MOVW (SI), R9 + MOVB 2(SI), SI + MOVW R9, (DI) + MOVB SI, 2(DI) JMP emit_literal_done_match_emit_encodeBlockAsm emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_5through7: - MOVL (BP), R8 - MOVL -4(BP)(DI*1), BP - MOVL R8, (SI) - MOVL BP, -4(SI)(DI*1) + MOVL (SI), R9 + MOVL -4(SI)(R8*1), SI + MOVL R9, (DI) + MOVL SI, -4(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8: - MOVQ (BP), R8 - MOVQ R8, (SI) + MOVQ (SI), R9 + MOVQ R9, (DI) JMP emit_literal_done_match_emit_encodeBlockAsm emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_9through16: - MOVQ (BP), R8 - MOVQ -8(BP)(DI*1), BP - MOVQ R8, (SI) - MOVQ BP, -8(SI)(DI*1) + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), SI + MOVQ R9, (DI) + MOVQ SI, -8(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32: - MOVOU (BP), X0 - MOVOU -16(BP)(DI*1), X1 - MOVOU X0, (SI) - MOVOU X1, -16(SI)(DI*1) + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 + MOVOU X0, (DI) + MOVOU X1, -16(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64: - MOVOU (BP), X0 - MOVOU 16(BP), X1 - MOVOU -32(BP)(DI*1), X2 - MOVOU -16(BP)(DI*1), X3 - MOVOU X0, (SI) - MOVOU X1, 16(SI) - MOVOU X2, -32(SI)(DI*1) - MOVOU X3, -16(SI)(DI*1) + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, -32(DI)(R8*1) + MOVOU X3, -16(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_65through128: - MOVOU (BP), X0 - MOVOU 16(BP), X1 - MOVOU 32(BP), X2 - MOVOU 48(BP), X3 - MOVOU -64(BP)(DI*1), X12 - MOVOU -48(BP)(DI*1), X13 - MOVOU -32(BP)(DI*1), X14 - MOVOU -16(BP)(DI*1), X15 - MOVOU X0, (SI) - MOVOU X1, 16(SI) - MOVOU X2, 32(SI) - MOVOU X3, 48(SI) - MOVOU X12, -64(SI)(DI*1) - MOVOU X13, -48(SI)(DI*1) - MOVOU X14, -32(SI)(DI*1) - MOVOU X15, -16(SI)(DI*1) + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) + MOVOU X3, 48(DI) + MOVOU X12, -64(DI)(R8*1) + MOVOU X13, -48(DI)(R8*1) + MOVOU X14, -32(DI)(R8*1) + MOVOU X15, -16(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_129through256: - MOVOU (BP), X0 - MOVOU 16(BP), X1 - MOVOU 32(BP), X2 - MOVOU 48(BP), X3 - MOVOU 64(BP), X4 - MOVOU 80(BP), X5 - MOVOU 96(BP), X6 - MOVOU 112(BP), X7 - MOVOU -128(BP)(DI*1), X8 - MOVOU -112(BP)(DI*1), X9 - MOVOU -96(BP)(DI*1), X10 - MOVOU -80(BP)(DI*1), X11 - MOVOU -64(BP)(DI*1), X12 - MOVOU -48(BP)(DI*1), X13 - MOVOU -32(BP)(DI*1), X14 - MOVOU -16(BP)(DI*1), X15 - MOVOU X0, (SI) - MOVOU X1, 16(SI) - MOVOU X2, 32(SI) - MOVOU X3, 48(SI) - MOVOU X4, 64(SI) - MOVOU X5, 80(SI) - MOVOU X6, 96(SI) - MOVOU X7, 112(SI) - MOVOU X8, -128(SI)(DI*1) - MOVOU X9, -112(SI)(DI*1) - MOVOU X10, -96(SI)(DI*1) - MOVOU X11, -80(SI)(DI*1) - MOVOU X12, -64(SI)(DI*1) - MOVOU X13, -48(SI)(DI*1) - MOVOU X14, -32(SI)(DI*1) - MOVOU X15, -16(SI)(DI*1) + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU -128(SI)(R8*1), X8 + MOVOU -112(SI)(R8*1), X9 + MOVOU -96(SI)(R8*1), X10 + MOVOU -80(SI)(R8*1), X11 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) + MOVOU X3, 48(DI) + MOVOU X4, 64(DI) + MOVOU X5, 80(DI) + MOVOU X6, 96(DI) + MOVOU X7, 112(DI) + MOVOU X8, -128(DI)(R8*1) + MOVOU X9, -112(DI)(R8*1) + MOVOU X10, -96(DI)(R8*1) + MOVOU X11, -80(DI)(R8*1) + MOVOU X12, -64(DI)(R8*1) + MOVOU X13, -48(DI)(R8*1) + MOVOU X14, -32(DI)(R8*1) + MOVOU X15, -16(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_256through2048: - LEAQ -256(DI), DI - MOVOU (BP), X0 - MOVOU 16(BP), X1 - MOVOU 32(BP), X2 - MOVOU 48(BP), X3 - MOVOU 64(BP), X4 - MOVOU 80(BP), X5 - MOVOU 96(BP), X6 - MOVOU 112(BP), X7 - MOVOU 128(BP), X8 - MOVOU 144(BP), X9 - MOVOU 160(BP), X10 - MOVOU 176(BP), X11 - MOVOU 192(BP), X12 - MOVOU 208(BP), X13 - MOVOU 224(BP), X14 - MOVOU 240(BP), X15 - MOVOU X0, (SI) - MOVOU X1, 16(SI) - MOVOU X2, 32(SI) - MOVOU X3, 48(SI) - MOVOU X4, 64(SI) - MOVOU X5, 80(SI) - MOVOU X6, 96(SI) - MOVOU X7, 112(SI) - MOVOU X8, 128(SI) - MOVOU X9, 144(SI) - MOVOU X10, 160(SI) - MOVOU X11, 176(SI) - MOVOU X12, 192(SI) - MOVOU X13, 208(SI) - MOVOU X14, 224(SI) - MOVOU X15, 240(SI) - CMPQ DI, $0x00000100 - LEAQ 256(BP), BP + LEAQ -256(R8), R8 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU 128(SI), X8 + MOVOU 144(SI), X9 + MOVOU 160(SI), X10 + MOVOU 176(SI), X11 + MOVOU 192(SI), X12 + MOVOU 208(SI), X13 + MOVOU 224(SI), X14 + MOVOU 240(SI), X15 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) + MOVOU X3, 48(DI) + MOVOU X4, 64(DI) + MOVOU X5, 80(DI) + MOVOU X6, 96(DI) + MOVOU X7, 112(DI) + MOVOU X8, 128(DI) + MOVOU X9, 144(DI) + MOVOU X10, 160(DI) + MOVOU X11, 176(DI) + MOVOU X12, 192(DI) + MOVOU X13, 208(DI) + MOVOU X14, 224(DI) + MOVOU X15, 240(DI) + CMPQ R8, $0x00000100 LEAQ 256(SI), SI + LEAQ 256(DI), DI JGE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_256through2048 JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_tail - MOVQ R8, SI + MOVQ R9, DI emit_literal_done_match_emit_encodeBlockAsm: - MOVQ SI, dst_base+0(FP) + MOVQ DI, dst_base+0(FP) emit_literal_skip_match_emit_encodeBlockAsm: NOP match_nolit_loop_encodeBlockAsm: - MOVL AX, BP - MOVL AX, BP - SUBL BX, BP - MOVL BP, 24(SP) + MOVL AX, SI + MOVL AX, SI + SUBL BX, SI + MOVL SI, 24(SP) ADDL $0x04, AX ADDL $0x04, BX - MOVL 16(SP), BP - SUBL AX, BP - XORQ DI, DI - CMPQ BP, $0x08 + MOVL 16(SP), SI + SUBL AX, SI + XORQ R8, R8 + CMPQ SI, $0x08 JL matchlen_single_match_nolit_encodeBlockAsm matchlen_loopback_match_nolit_encodeBlockAsm: - MOVQ (CX)(DI*1), SI - XORQ (CX)(DI*1), SI - TESTQ SI, SI + MOVQ (CX)(R8*1), DI + XORQ (CX)(R8*1), DI + TESTQ DI, DI JZ matchlen_loop_match_nolit_encodeBlockAsm - BSFQ SI, SI - SARQ $0x03, SI - LEAQ (DI)(SI*1), DI + BSFQ DI, DI + SARQ $0x03, DI + LEAQ (R8)(DI*1), R8 JMP match_nolit_end_encodeBlockAsm matchlen_loop_match_nolit_encodeBlockAsm: - LEAQ -8(BP), BP - LEAQ 8(DI), DI - CMPQ BP, $0x08 + LEAQ -8(SI), SI + LEAQ 8(R8), R8 + CMPQ SI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm matchlen_single_match_nolit_encodeBlockAsm: - TESTQ BP, BP + TESTQ SI, SI JZ match_nolit_end_encodeBlockAsm matchlen_single_loopback_match_nolit_encodeBlockAsm: - MOVB (CX)(DI*1), SI - CMPB (CX)(DI*1), SI + MOVB (CX)(R8*1), DI + CMPB (CX)(R8*1), DI JNE match_nolit_end_encodeBlockAsm - LEAQ 1(DI), DI - DECQ BP + LEAQ 1(R8), R8 + DECQ SI JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm match_nolit_end_encodeBlockAsm: - MOVL 24(SP), BP - ADDQ $0x04, DI - MOVQ dst_base+0(FP), SI - ADDL DI, AX - CMPL BP, $0x00010000 + MOVL 24(SP), SI + ADDQ $0x04, R8 + MOVQ dst_base+0(FP), DI + ADDL R8, AX + CMPL SI, $0x00010000 JL two_byte_offset_match_nolit_encodeBlockAsm - CMPL DI, $0x40 + CMPL R8, $0x40 JLE four_bytes_remain_match_nolit_encodeBlockAsm - MOVB $0xff, (SI) - MOVD BP, 1(SI) - LEAQ -64(DI), DI - ADDQ $0x05, SI - CMPL DI, $0x04 + MOVB $0xff, (DI) + MOVD SI, 1(DI) + LEAQ -64(R8), R8 + ADDQ $0x05, DI + CMPL R8, $0x04 JL four_bytes_remain_match_nolit_encodeBlockAsm emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy: - MOVQ DI, R8 - LEAQ -4(DI), DI - CMPL R8, $0x08 + MOVQ R8, R9 + LEAQ -4(R8), R8 + CMPL R9, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy - CMPL R8, $0x0c + CMPL R9, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: - CMPL DI, $0x00000104 + CMPL R8, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy - CMPL DI, $0x00010100 + CMPL R8, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy - CMPL DI, $0x0100ffff + CMPL R8, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy - LEAQ -16842747(DI), DI - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + LEAQ -16842747(R8), R8 + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy repeat_five_match_nolit_encodeBlockAsm_emit_copy: - LEAQ -65536(DI), DI - MOVQ DI, BP - MOVW $0x001d, (SI) - MOVW DI, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + LEAQ -65536(R8), R8 + MOVQ R8, SI + MOVW $0x001d, (DI) + MOVW R8, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_four_match_nolit_encodeBlockAsm_emit_copy: - LEAQ -256(DI), DI - MOVW $0x0019, (SI) - MOVW DI, 2(SI) - ADDQ $0x04, SI + LEAQ -256(R8), R8 + MOVW $0x0019, (DI) + MOVW R8, 2(DI) + ADDQ $0x04, DI JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_three_match_nolit_encodeBlockAsm_emit_copy: - LEAQ -4(DI), DI - MOVW $0x0015, (SI) - MOVB DI, 2(SI) - ADDQ $0x03, SI + LEAQ -4(R8), R8 + MOVW $0x0015, (DI) + MOVB R8, 2(DI) + ADDQ $0x03, DI JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_two_match_nolit_encodeBlockAsm_emit_copy: - SHLL $0x02, DI - ORL $0x01, DI - MOVW DI, (SI) - ADDQ $0x02, SI + SHLL $0x02, R8 + ORL $0x01, R8 + MOVW R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: - XORQ R8, R8 - LEAQ 1(R8)(DI*4), DI - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, DI - MOVB DI, (SI) - ADDQ $0x02, SI + XORQ R9, R9 + LEAQ 1(R9)(R8*4), R8 + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R8 + MOVB R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsm four_bytes_remain_match_nolit_encodeBlockAsm: - TESTL DI, DI + TESTL R8, R8 JZ match_nolit_emitcopy_end_encodeBlockAsm MOVB $0x03, DL - LEAQ -4(DX)(DI*4), DI - MOVB DI, (SI) - MOVD BP, 1(SI) - ADDQ $0x05, SI + LEAQ -4(DX)(R8*4), R8 + MOVB R8, (DI) + MOVD SI, 1(DI) + ADDQ $0x05, DI JMP match_nolit_emitcopy_end_encodeBlockAsm two_byte_offset_match_nolit_encodeBlockAsm: - CMPL DI, $0x40 + CMPL R8, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm - MOVB $0xee, (SI) - MOVW BP, 1(SI) - LEAQ -60(DI), DI - ADDQ $0x03, SI + MOVB $0xee, (DI) + MOVW SI, 1(DI) + LEAQ -60(R8), R8 + ADDQ $0x03, DI emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short: - MOVQ DI, R8 - LEAQ -4(DI), DI - CMPL R8, $0x08 + MOVQ R8, R9 + LEAQ -4(R8), R8 + CMPL R9, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short - CMPL R8, $0x0c + CMPL R9, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: - CMPL DI, $0x00000104 + CMPL R8, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy_short - CMPL DI, $0x00010100 + CMPL R8, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy_short - CMPL DI, $0x0100ffff + CMPL R8, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy_short - LEAQ -16842747(DI), DI - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + LEAQ -16842747(R8), R8 + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short repeat_five_match_nolit_encodeBlockAsm_emit_copy_short: - LEAQ -65536(DI), DI - MOVQ DI, BP - MOVW $0x001d, (SI) - MOVW DI, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + LEAQ -65536(R8), R8 + MOVQ R8, SI + MOVW $0x001d, (DI) + MOVW R8, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_four_match_nolit_encodeBlockAsm_emit_copy_short: - LEAQ -256(DI), DI - MOVW $0x0019, (SI) - MOVW DI, 2(SI) - ADDQ $0x04, SI + LEAQ -256(R8), R8 + MOVW $0x0019, (DI) + MOVW R8, 2(DI) + ADDQ $0x04, DI JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_three_match_nolit_encodeBlockAsm_emit_copy_short: - LEAQ -4(DI), DI - MOVW $0x0015, (SI) - MOVB DI, 2(SI) - ADDQ $0x03, SI + LEAQ -4(R8), R8 + MOVW $0x0015, (DI) + MOVB R8, 2(DI) + ADDQ $0x03, DI JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_two_match_nolit_encodeBlockAsm_emit_copy_short: - SHLL $0x02, DI - ORL $0x01, DI - MOVW DI, (SI) - ADDQ $0x02, SI + SHLL $0x02, R8 + ORL $0x01, R8 + MOVW R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: - XORQ R8, R8 - LEAQ 1(R8)(DI*4), DI - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, DI - MOVB DI, (SI) - ADDQ $0x02, SI + XORQ R9, R9 + LEAQ 1(R9)(R8*4), R8 + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R8 + MOVB R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsm two_byte_offset_short_match_nolit_encodeBlockAsm: - CMPL DI, $0x0c + CMPL R8, $0x0c JGE emit_copy_three_match_nolit_encodeBlockAsm - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_match_nolit_encodeBlockAsm MOVB $0x01, DL - LEAQ -16(DX)(DI*4), DI - MOVB BP, 1(SI) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, DI - MOVB DI, (SI) - ADDQ $0x02, SI + LEAQ -16(DX)(R8*4), R8 + MOVB SI, 1(DI) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R8 + MOVB R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsm emit_copy_three_match_nolit_encodeBlockAsm: MOVB $0x02, DL - LEAQ -4(DX)(DI*4), DI - MOVB DI, (SI) - MOVW BP, 1(SI) - ADDQ $0x03, SI + LEAQ -4(DX)(R8*4), R8 + MOVB R8, (DI) + MOVW SI, 1(DI) + ADDQ $0x03, DI match_nolit_emitcopy_end_encodeBlockAsm: - MOVQ SI, dst_base+0(FP) + MOVQ DI, dst_base+0(FP) MOVL AX, 20(SP) CMPL AX, 16(SP) JGE emit_remainder_encodeBlockAsm - CMPQ SI, (SP) + CMPQ DI, (SP) JL match_nolit_dst_ok_encodeBlockAsm MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeBlockAsm: - MOVQ -2(CX)(AX*1), BP - MOVQ $0x0000cf1bbcdcbf9b, SI - MOVQ BP, DI - SHRQ $0x10, BP - MOVQ BP, R8 - SHLQ $0x10, DI - IMULQ SI, DI - SHRQ $0x30, DI + MOVQ -2(CX)(AX*1), SI + MOVQ $0x0000cf1bbcdcbf9b, DI + MOVQ SI, R8 + SHRQ $0x10, SI + MOVQ SI, R9 SHLQ $0x10, R8 - IMULQ SI, R8 + IMULQ DI, R8 SHRQ $0x30, R8 - MOVL 32(SP)(DI*1), SI - MOVL 32(SP)(R8*1), SI - LEAQ -2(AX), SI - MOVL SI, 32(SP)(DI*1) - MOVL AX, 32(SP)(R8*1) - CMPL (CX)(R8*1), BP + SHLQ $0x10, R9 + IMULQ DI, R9 + SHRQ $0x30, R9 + MOVL 32(SP)(R8*1), DI + MOVL 32(SP)(R9*1), DI + LEAQ -2(AX), DI + MOVL DI, 32(SP)(R8*1) + MOVL AX, 32(SP)(R9*1) + CMPL (CX)(R9*1), SI JEQ match_nolit_loop_encodeBlockAsm INCL AX JMP search_loop_encodeBlockAsm @@ -1217,11 +1217,11 @@ emit_remainder_ok_encodeBlockAsm: JMP memmove_emit_remainder_encodeBlockAsm four_bytes_emit_remainder_encodeBlockAsm: - MOVQ DX, BP - SHRL $0x10, BP + MOVQ DX, SI + SHRL $0x10, SI MOVB $0xf8, (CX) MOVW DX, 1(CX) - MOVB BP, 3(CX) + MOVB SI, 3(CX) ADDQ $0x04, CX JMP memmove_emit_remainder_encodeBlockAsm @@ -1464,224 +1464,224 @@ zero_loop_encodeBlockAsm14B: MOVQ src_base+24(FP), CX search_loop_encodeBlockAsm14B: - MOVQ (CX)(AX*1), BP + MOVQ (CX)(AX*1), SI MOVL AX, BX SUBL 20(SP), BX SHRL $0x05, BX LEAQ 4(AX)(BX*1), BX - MOVL 16(SP), SI - CMPL BX, SI + MOVL 16(SP), DI + CMPL BX, DI JGT emit_remainder_encodeBlockAsm14B MOVL BX, 28(SP) MOVQ $0x0000cf1bbcdcbf9b, BX - MOVQ BP, DI - MOVQ BP, R8 - SHRQ $0x08, R8 - SHLQ $0x10, DI - IMULQ BX, DI - SHRQ $0x32, DI + MOVQ SI, R8 + MOVQ SI, R9 + SHRQ $0x08, R9 SHLQ $0x10, R8 IMULQ BX, R8 SHRQ $0x32, R8 - MOVL 32(SP)(DI*1), BX - MOVL 32(SP)(R8*1), SI - MOVL AX, 32(SP)(DI*1) - LEAL 1(AX), DI - MOVL DI, 32(SP)(R8*1) - MOVL AX, DI - SUBL 24(SP), DI - MOVL 1(CX)(DI*1), R9 - MOVQ BP, R8 - SHLQ $0x08, R8 - CMPL R8, R9 + SHLQ $0x10, R9 + IMULQ BX, R9 + SHRQ $0x32, R9 + MOVL 32(SP)(R8*1), BX + MOVL 32(SP)(R9*1), DI + MOVL AX, 32(SP)(R8*1) + LEAL 1(AX), R8 + MOVL R8, 32(SP)(R9*1) + MOVL AX, R8 + SUBL 24(SP), R8 + MOVL 1(CX)(R8*1), R10 + MOVQ SI, R9 + SHLQ $0x08, R9 + CMPL R9, R10 JNE no_repeat_found_encodeBlockAsm14B - LEAQ 1(AX), BP + LEAQ 1(AX), SI MOVL 20(SP), BX - TESTL DI, DI + TESTL R8, R8 JZ repeat_extend_back_end_encodeBlockAsm14B repeat_extend_back_loop_encodeBlockAsm14B: - CMPL BP, BX + CMPL SI, BX JG repeat_extend_back_end_encodeBlockAsm14B - MOVB -1(CX)(DI*1), DL - MOVB -1(CX)(BP*1), SI - CMPB DL, SI + MOVB -1(CX)(R8*1), DL + MOVB -1(CX)(SI*1), DI + CMPB DL, DI JNE repeat_extend_back_end_encodeBlockAsm14B - LEAQ -1(BP), BP - DECL DI + LEAQ -1(SI), SI + DECL R8 JZ repeat_extend_back_end_encodeBlockAsm14B JMP repeat_extend_back_loop_encodeBlockAsm14B repeat_extend_back_end_encodeBlockAsm14B: MOVL 20(SP), BX - CMPL BX, BP + CMPL BX, SI JEQ emit_literal_skip_repeat_emit_encodeBlockAsm14B - MOVL BP, SI - MOVL BP, 20(SP) - LEAQ (CX)(BX*1), DI - SUBL BX, SI + MOVL SI, DI + MOVL SI, 20(SP) + LEAQ (CX)(BX*1), R8 + SUBL BX, DI MOVQ dst_base+0(FP), BX - MOVQ SI, R8 - SUBL $0x01, R8 + MOVQ DI, R9 + SUBL $0x01, R9 JC emit_literal_done_repeat_emit_encodeBlockAsm14B - CMPL R8, $0x3c + CMPL R9, $0x3c JLT one_byte_repeat_emit_encodeBlockAsm14B - CMPL R8, $0x00000100 + CMPL R9, $0x00000100 JLT two_bytes_repeat_emit_encodeBlockAsm14B - CMPL R8, $0x00010000 + CMPL R9, $0x00010000 JLT three_bytes_repeat_emit_encodeBlockAsm14B - CMPL R8, $0x01000000 + CMPL R9, $0x01000000 JLT four_bytes_repeat_emit_encodeBlockAsm14B MOVB $0xfc, (BX) - MOVL R8, 1(BX) + MOVL R9, 1(BX) ADDQ $0x05, BX JMP memmove_repeat_emit_encodeBlockAsm14B four_bytes_repeat_emit_encodeBlockAsm14B: - MOVQ R8, R9 - SHRL $0x10, R9 + MOVQ R9, R10 + SHRL $0x10, R10 MOVB $0xf8, (BX) - MOVW R8, 1(BX) - MOVB R9, 3(BX) + MOVW R9, 1(BX) + MOVB R10, 3(BX) ADDQ $0x04, BX JMP memmove_repeat_emit_encodeBlockAsm14B three_bytes_repeat_emit_encodeBlockAsm14B: MOVB $0xf4, (BX) - MOVW R8, 1(BX) + MOVW R9, 1(BX) ADDQ $0x03, BX JMP memmove_repeat_emit_encodeBlockAsm14B two_bytes_repeat_emit_encodeBlockAsm14B: MOVB $0xf0, (BX) - MOVB R8, 1(BX) + MOVB R9, 1(BX) ADDQ $0x02, BX JMP memmove_repeat_emit_encodeBlockAsm14B one_byte_repeat_emit_encodeBlockAsm14B: - SHLB $0x02, R8 - MOVB R8, (BX) + SHLB $0x02, R9 + MOVB R9, (BX) ADDQ $0x01, BX memmove_repeat_emit_encodeBlockAsm14B: - LEAQ (BX)(SI*1), R8 + LEAQ (BX)(DI*1), R9 NOP emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_tail: - TESTQ SI, SI + TESTQ DI, DI JEQ emit_literal_done_repeat_emit_encodeBlockAsm14B - CMPQ SI, $0x02 + CMPQ DI, $0x02 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_1or2 - CMPQ SI, $0x04 + CMPQ DI, $0x04 JB emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_3 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_4 - CMPQ SI, $0x08 + CMPQ DI, $0x08 JB emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_5through7 JE emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_8 - CMPQ SI, $0x10 + CMPQ DI, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_9through16 - CMPQ SI, $0x20 + CMPQ DI, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_17through32 - CMPQ SI, $0x40 + CMPQ DI, $0x40 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_33through64 - CMPQ SI, $0x80 + CMPQ DI, $0x80 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_65through128 - CMPQ SI, $0x00000100 + CMPQ DI, $0x00000100 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_129through256 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_256through2048 emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_1or2: - MOVB (DI), R8 - MOVB -1(DI)(SI*1), DI - MOVB R8, (BX) - MOVB DI, -1(BX)(SI*1) + MOVB (R8), R9 + MOVB -1(R8)(DI*1), R8 + MOVB R9, (BX) + MOVB R8, -1(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm14B emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_4: - MOVL (DI), R8 - MOVL R8, (BX) + MOVL (R8), R9 + MOVL R9, (BX) JMP emit_literal_done_repeat_emit_encodeBlockAsm14B emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_3: - MOVW (DI), R8 - MOVB 2(DI), DI - MOVW R8, (BX) - MOVB DI, 2(BX) + MOVW (R8), R9 + MOVB 2(R8), R8 + MOVW R9, (BX) + MOVB R8, 2(BX) JMP emit_literal_done_repeat_emit_encodeBlockAsm14B emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_5through7: - MOVL (DI), R8 - MOVL -4(DI)(SI*1), DI - MOVL R8, (BX) - MOVL DI, -4(BX)(SI*1) + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R8 + MOVL R9, (BX) + MOVL R8, -4(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm14B emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_8: - MOVQ (DI), R8 - MOVQ R8, (BX) + MOVQ (R8), R9 + MOVQ R9, (BX) JMP emit_literal_done_repeat_emit_encodeBlockAsm14B emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_9through16: - MOVQ (DI), R8 - MOVQ -8(DI)(SI*1), DI - MOVQ R8, (BX) - MOVQ DI, -8(BX)(SI*1) + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (BX) + MOVQ R8, -8(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm14B emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(SI*1), X1 + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 MOVOU X0, (BX) - MOVOU X1, -16(BX)(SI*1) + MOVOU X1, -16(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm14B emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(SI*1), X2 - MOVOU -16(DI)(SI*1), X3 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 MOVOU X0, (BX) MOVOU X1, 16(BX) - MOVOU X2, -32(BX)(SI*1) - MOVOU X3, -16(BX)(SI*1) + MOVOU X2, -32(BX)(DI*1) + MOVOU X3, -16(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm14B emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_65through128: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU 32(DI), X2 - MOVOU 48(DI), X3 - MOVOU -64(DI)(SI*1), X12 - MOVOU -48(DI)(SI*1), X13 - MOVOU -32(DI)(SI*1), X14 - MOVOU -16(DI)(SI*1), X15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (BX) MOVOU X1, 16(BX) MOVOU X2, 32(BX) MOVOU X3, 48(BX) - MOVOU X12, -64(BX)(SI*1) - MOVOU X13, -48(BX)(SI*1) - MOVOU X14, -32(BX)(SI*1) - MOVOU X15, -16(BX)(SI*1) + MOVOU X12, -64(BX)(DI*1) + MOVOU X13, -48(BX)(DI*1) + MOVOU X14, -32(BX)(DI*1) + MOVOU X15, -16(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm14B emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_129through256: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU 32(DI), X2 - MOVOU 48(DI), X3 - MOVOU 64(DI), X4 - MOVOU 80(DI), X5 - MOVOU 96(DI), X6 - MOVOU 112(DI), X7 - MOVOU -128(DI)(SI*1), X8 - MOVOU -112(DI)(SI*1), X9 - MOVOU -96(DI)(SI*1), X10 - MOVOU -80(DI)(SI*1), X11 - MOVOU -64(DI)(SI*1), X12 - MOVOU -48(DI)(SI*1), X13 - MOVOU -32(DI)(SI*1), X14 - MOVOU -16(DI)(SI*1), X15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU -128(R8)(DI*1), X8 + MOVOU -112(R8)(DI*1), X9 + MOVOU -96(R8)(DI*1), X10 + MOVOU -80(R8)(DI*1), X11 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (BX) MOVOU X1, 16(BX) MOVOU X2, 32(BX) @@ -1690,34 +1690,34 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_129through256: MOVOU X5, 80(BX) MOVOU X6, 96(BX) MOVOU X7, 112(BX) - MOVOU X8, -128(BX)(SI*1) - MOVOU X9, -112(BX)(SI*1) - MOVOU X10, -96(BX)(SI*1) - MOVOU X11, -80(BX)(SI*1) - MOVOU X12, -64(BX)(SI*1) - MOVOU X13, -48(BX)(SI*1) - MOVOU X14, -32(BX)(SI*1) - MOVOU X15, -16(BX)(SI*1) + MOVOU X8, -128(BX)(DI*1) + MOVOU X9, -112(BX)(DI*1) + MOVOU X10, -96(BX)(DI*1) + MOVOU X11, -80(BX)(DI*1) + MOVOU X12, -64(BX)(DI*1) + MOVOU X13, -48(BX)(DI*1) + MOVOU X14, -32(BX)(DI*1) + MOVOU X15, -16(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm14B emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_256through2048: - LEAQ -256(SI), SI - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU 32(DI), X2 - MOVOU 48(DI), X3 - MOVOU 64(DI), X4 - MOVOU 80(DI), X5 - MOVOU 96(DI), X6 - MOVOU 112(DI), X7 - MOVOU 128(DI), X8 - MOVOU 144(DI), X9 - MOVOU 160(DI), X10 - MOVOU 176(DI), X11 - MOVOU 192(DI), X12 - MOVOU 208(DI), X13 - MOVOU 224(DI), X14 - MOVOU 240(DI), X15 + LEAQ -256(DI), DI + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU 128(R8), X8 + MOVOU 144(R8), X9 + MOVOU 160(R8), X10 + MOVOU 176(R8), X11 + MOVOU 192(R8), X12 + MOVOU 208(R8), X13 + MOVOU 224(R8), X14 + MOVOU 240(R8), X15 MOVOU X0, (BX) MOVOU X1, 16(BX) MOVOU X2, 32(BX) @@ -1734,12 +1734,12 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_256through2048: MOVOU X13, 208(BX) MOVOU X14, 224(BX) MOVOU X15, 240(BX) - CMPQ SI, $0x00000100 - LEAQ 256(DI), DI + CMPQ DI, $0x00000100 + LEAQ 256(R8), R8 LEAQ 256(BX), BX JGE emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_move_256through2048 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm14B_memmove_tail - MOVQ R8, BX + MOVQ R9, BX emit_literal_done_repeat_emit_encodeBlockAsm14B: MOVQ BX, dst_base+0(FP) @@ -1750,23 +1750,23 @@ emit_literal_skip_repeat_emit_encodeBlockAsm14B: SUBL 24(SP), BX MOVL 16(SP), BX SUBL AX, BX - XORQ DI, DI + XORQ R8, R8 CMPQ BX, $0x08 JL matchlen_single_repeat_extend matchlen_loopback_repeat_extend: - MOVQ (CX)(DI*1), SI - XORQ (CX)(DI*1), SI - TESTQ SI, SI + MOVQ (CX)(R8*1), DI + XORQ (CX)(R8*1), DI + TESTQ DI, DI JZ matchlen_loop_repeat_extend - BSFQ SI, SI - SARQ $0x03, SI - LEAQ (DI)(SI*1), DI + BSFQ DI, DI + SARQ $0x03, DI + LEAQ (R8)(DI*1), R8 JMP repeat_extend_forward_end_encodeBlockAsm14B matchlen_loop_repeat_extend: LEAQ -8(BX), BX - LEAQ 8(DI), DI + LEAQ 8(R8), R8 CMPQ BX, $0x08 JGE matchlen_loopback_repeat_extend @@ -1775,31 +1775,31 @@ matchlen_single_repeat_extend: JZ repeat_extend_forward_end_encodeBlockAsm14B matchlen_single_loopback_repeat_extend: - MOVB (CX)(DI*1), SI - CMPB (CX)(DI*1), SI + MOVB (CX)(R8*1), DI + CMPB (CX)(R8*1), DI JNE repeat_extend_forward_end_encodeBlockAsm14B - LEAQ 1(DI), DI + LEAQ 1(R8), R8 DECQ BX JNZ matchlen_single_loopback_repeat_extend repeat_extend_forward_end_encodeBlockAsm14B: - ADDL DI, AX + ADDL R8, AX MOVL AX, BX - SUBL BP, BX - MOVL 24(SP), BP - MOVQ dst_base+0(FP), SI - MOVL 20(SP), DI - TESTL DI, DI + SUBL SI, BX + MOVL 24(SP), SI + MOVQ dst_base+0(FP), DI + MOVL 20(SP), R8 + TESTL R8, R8 JZ repeat_as_copy_encodeBlockAsm14B emit_repeat_again_match_repeat_: - MOVQ BX, DI + MOVQ BX, R8 LEAQ -4(BX), BX - CMPL DI, $0x08 + CMPL R8, $0x08 JLE repeat_two_match_repeat_ - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_match_repeat_ - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_repeat_ cant_repeat_two_offset_match_repeat_: @@ -1810,74 +1810,74 @@ cant_repeat_two_offset_match_repeat_: CMPL BX, $0x0100ffff JLT repeat_five_match_repeat_ LEAQ -16842747(BX), BX - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_match_repeat_ repeat_five_match_repeat_: LEAQ -65536(BX), BX - MOVQ BX, BP - MOVW $0x001d, (SI) - MOVW BX, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + MOVQ BX, SI + MOVW $0x001d, (DI) + MOVW BX, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP repeat_end_emit_encodeBlockAsm14B repeat_four_match_repeat_: LEAQ -256(BX), BX - MOVW $0x0019, (SI) - MOVW BX, 2(SI) - ADDQ $0x04, SI + MOVW $0x0019, (DI) + MOVW BX, 2(DI) + ADDQ $0x04, DI JMP repeat_end_emit_encodeBlockAsm14B repeat_three_match_repeat_: LEAQ -4(BX), BX - MOVW $0x0015, (SI) - MOVB BL, 2(SI) - ADDQ $0x03, SI + MOVW $0x0015, (DI) + MOVB BL, 2(DI) + ADDQ $0x03, DI JMP repeat_end_emit_encodeBlockAsm14B repeat_two_match_repeat_: SHLL $0x02, BX ORL $0x01, BX - MOVW BX, (SI) - ADDQ $0x02, SI + MOVW BX, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm14B repeat_two_offset_match_repeat_: - XORQ DI, DI - LEAQ 1(DI)(BX*4), BX - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (SI) - ADDQ $0x02, SI + XORQ R8, R8 + LEAQ 1(R8)(BX*4), BX + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm14B repeat_as_copy_encodeBlockAsm14B: - CMPL BP, $0x00010000 + CMPL SI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeBlockAsm14B CMPL BX, $0x40 JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm14B - MOVB $0xff, (SI) - MOVD BP, 1(SI) + MOVB $0xff, (DI) + MOVD SI, 1(DI) LEAQ -64(BX), BX - ADDQ $0x05, SI + ADDQ $0x05, DI CMPL BX, $0x04 JL four_bytes_remain_repeat_as_copy_encodeBlockAsm14B emit_repeat_again_repeat_as_copy_encodeBlockAsm14B_emit_copy: - MOVQ BX, DI + MOVQ BX, R8 LEAQ -4(BX), BX - CMPL DI, $0x08 + CMPL R8, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm14B_emit_copy - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm14B_emit_copy - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm14B_emit_copy cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm14B_emit_copy: @@ -1888,52 +1888,52 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm14B_emit_copy: CMPL BX, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsm14B_emit_copy LEAQ -16842747(BX), BX - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm14B_emit_copy repeat_five_repeat_as_copy_encodeBlockAsm14B_emit_copy: LEAQ -65536(BX), BX - MOVQ BX, BP - MOVW $0x001d, (SI) - MOVW BX, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + MOVQ BX, SI + MOVW $0x001d, (DI) + MOVW BX, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP repeat_end_emit_encodeBlockAsm14B repeat_four_repeat_as_copy_encodeBlockAsm14B_emit_copy: LEAQ -256(BX), BX - MOVW $0x0019, (SI) - MOVW BX, 2(SI) - ADDQ $0x04, SI + MOVW $0x0019, (DI) + MOVW BX, 2(DI) + ADDQ $0x04, DI JMP repeat_end_emit_encodeBlockAsm14B repeat_three_repeat_as_copy_encodeBlockAsm14B_emit_copy: LEAQ -4(BX), BX - MOVW $0x0015, (SI) - MOVB BL, 2(SI) - ADDQ $0x03, SI + MOVW $0x0015, (DI) + MOVB BL, 2(DI) + ADDQ $0x03, DI JMP repeat_end_emit_encodeBlockAsm14B repeat_two_repeat_as_copy_encodeBlockAsm14B_emit_copy: SHLL $0x02, BX ORL $0x01, BX - MOVW BX, (SI) - ADDQ $0x02, SI + MOVW BX, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm14B repeat_two_offset_repeat_as_copy_encodeBlockAsm14B_emit_copy: - XORQ DI, DI - LEAQ 1(DI)(BX*4), BX - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (SI) - ADDQ $0x02, SI + XORQ R8, R8 + LEAQ 1(R8)(BX*4), BX + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm14B four_bytes_remain_repeat_as_copy_encodeBlockAsm14B: @@ -1941,27 +1941,27 @@ four_bytes_remain_repeat_as_copy_encodeBlockAsm14B: JZ repeat_end_emit_encodeBlockAsm14B MOVB $0x03, DL LEAQ -4(DX)(BX*4), BX - MOVB BL, (SI) - MOVD BP, 1(SI) - ADDQ $0x05, SI + MOVB BL, (DI) + MOVD SI, 1(DI) + ADDQ $0x05, DI JMP repeat_end_emit_encodeBlockAsm14B two_byte_offset_repeat_as_copy_encodeBlockAsm14B: CMPL BX, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm14B - MOVB $0xee, (SI) - MOVW BP, 1(SI) + MOVB $0xee, (DI) + MOVW SI, 1(DI) LEAQ -60(BX), BX - ADDQ $0x03, SI + ADDQ $0x03, DI emit_repeat_again_repeat_as_copy_encodeBlockAsm14B_emit_copy_short: - MOVQ BX, DI + MOVQ BX, R8 LEAQ -4(BX), BX - CMPL DI, $0x08 + CMPL R8, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm14B_emit_copy_short - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm14B_emit_copy_short - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm14B_emit_copy_short cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm14B_emit_copy_short: @@ -1972,100 +1972,100 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm14B_emit_copy_short: CMPL BX, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsm14B_emit_copy_short LEAQ -16842747(BX), BX - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm14B_emit_copy_short repeat_five_repeat_as_copy_encodeBlockAsm14B_emit_copy_short: LEAQ -65536(BX), BX - MOVQ BX, BP - MOVW $0x001d, (SI) - MOVW BX, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + MOVQ BX, SI + MOVW $0x001d, (DI) + MOVW BX, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP repeat_end_emit_encodeBlockAsm14B repeat_four_repeat_as_copy_encodeBlockAsm14B_emit_copy_short: LEAQ -256(BX), BX - MOVW $0x0019, (SI) - MOVW BX, 2(SI) - ADDQ $0x04, SI + MOVW $0x0019, (DI) + MOVW BX, 2(DI) + ADDQ $0x04, DI JMP repeat_end_emit_encodeBlockAsm14B repeat_three_repeat_as_copy_encodeBlockAsm14B_emit_copy_short: LEAQ -4(BX), BX - MOVW $0x0015, (SI) - MOVB BL, 2(SI) - ADDQ $0x03, SI + MOVW $0x0015, (DI) + MOVB BL, 2(DI) + ADDQ $0x03, DI JMP repeat_end_emit_encodeBlockAsm14B repeat_two_repeat_as_copy_encodeBlockAsm14B_emit_copy_short: SHLL $0x02, BX ORL $0x01, BX - MOVW BX, (SI) - ADDQ $0x02, SI + MOVW BX, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm14B repeat_two_offset_repeat_as_copy_encodeBlockAsm14B_emit_copy_short: - XORQ DI, DI - LEAQ 1(DI)(BX*4), BX - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (SI) - ADDQ $0x02, SI + XORQ R8, R8 + LEAQ 1(R8)(BX*4), BX + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm14B two_byte_offset_short_repeat_as_copy_encodeBlockAsm14B: CMPL BX, $0x0c JGE emit_copy_three_repeat_as_copy_encodeBlockAsm14B - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeBlockAsm14B MOVB $0x01, DL LEAQ -16(DX)(BX*4), BX - MOVB BP, 1(SI) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (SI) - ADDQ $0x02, SI + MOVB SI, 1(DI) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm14B emit_copy_three_repeat_as_copy_encodeBlockAsm14B: MOVB $0x02, DL LEAQ -4(DX)(BX*4), BX - MOVB BL, (SI) - MOVW BP, 1(SI) - ADDQ $0x03, SI + MOVB BL, (DI) + MOVW SI, 1(DI) + ADDQ $0x03, DI repeat_end_emit_encodeBlockAsm14B: - MOVQ SI, dst_base+0(FP) + MOVQ DI, dst_base+0(FP) MOVL 16(SP), BX CMPL AX, BX JGT emit_remainder_encodeBlockAsm14B JMP search_loop_encodeBlockAsm14B no_repeat_found_encodeBlockAsm14B: - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ BP, DI - SHRQ $0x10, DI - SHLQ $0x10, DI - IMULQ R8, DI - SHRQ $0x32, DI - CMPL (CX)(BX*1), BP - SHRQ $0x08, BP + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ SI, R8 + SHRQ $0x10, R8 + SHLQ $0x10, R8 + IMULQ R9, R8 + SHRQ $0x32, R8 + CMPL (CX)(BX*1), SI + SHRQ $0x08, SI JEQ candidate_match_encodeBlockAsm14B - MOVL 32(SP)(DI*1), BX - CMPL (CX)(SI*1), BP + MOVL 32(SP)(R8*1), BX + CMPL (CX)(DI*1), SI JEQ candidate2_match_encodeBlockAsm14B - LEAQ 2(AX), SI - MOVL SI, 32(SP)(DI*1) - SHRQ $0x08, BP - CMPL (CX)(BX*1), BP + LEAQ 2(AX), DI + MOVL DI, 32(SP)(R8*1) + SHRQ $0x08, SI + CMPL (CX)(BX*1), SI JEQ candidate3_match_encodeBlockAsm14B MOVL 28(SP), AX JMP search_loop_encodeBlockAsm14B @@ -2076,21 +2076,21 @@ candidate3_match_encodeBlockAsm14B: candidate2_match_encodeBlockAsm14B: LEAQ -2(AX), BX - MOVL BX, 32(SP)(DI*1) + MOVL BX, 32(SP)(R8*1) INCL AX - MOVL SI, BX + MOVL DI, BX candidate_match_encodeBlockAsm14B: - MOVL 20(SP), BP + MOVL 20(SP), SI TESTL BX, BX JZ match_extend_back_end_encodeBlockAsm14B match_extend_back_loop_encodeBlockAsm14B: - CMPL AX, BP + CMPL AX, SI JG match_extend_back_end_encodeBlockAsm14B MOVB -1(CX)(BX*1), DL - MOVB -1(CX)(AX*1), SI - CMPB DL, SI + MOVB -1(CX)(AX*1), DI + CMPB DL, DI JNE match_extend_back_end_encodeBlockAsm14B LEAL -1(AX), AX DECL BX @@ -2098,507 +2098,507 @@ match_extend_back_loop_encodeBlockAsm14B: JMP match_extend_back_loop_encodeBlockAsm14B match_extend_back_end_encodeBlockAsm14B: - MOVL AX, BP - SUBL 20(SP), BP - LEAQ dst_base+0(FP)(BP*1), BP - CMPQ BP, (SP) + MOVL AX, SI + SUBL 20(SP), SI + LEAQ dst_base+0(FP)(SI*1), SI + CMPQ SI, (SP) JL match_dst_size_check_encodeBlockAsm14B MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBlockAsm14B: - MOVL BX, BP - MOVL 20(SP), SI - CMPL SI, BP + MOVL BX, SI + MOVL 20(SP), DI + CMPL DI, SI JEQ emit_literal_skip_match_emit_encodeBlockAsm14B - MOVL BP, DI - MOVL BP, 20(SP) - LEAQ (CX)(SI*1), BP - SUBL SI, DI - MOVQ dst_base+0(FP), SI - MOVQ DI, R8 - SUBL $0x01, R8 + MOVL SI, R8 + MOVL SI, 20(SP) + LEAQ (CX)(DI*1), SI + SUBL DI, R8 + MOVQ dst_base+0(FP), DI + MOVQ R8, R9 + SUBL $0x01, R9 JC emit_literal_done_match_emit_encodeBlockAsm14B - CMPL R8, $0x3c + CMPL R9, $0x3c JLT one_byte_match_emit_encodeBlockAsm14B - CMPL R8, $0x00000100 + CMPL R9, $0x00000100 JLT two_bytes_match_emit_encodeBlockAsm14B - CMPL R8, $0x00010000 + CMPL R9, $0x00010000 JLT three_bytes_match_emit_encodeBlockAsm14B - CMPL R8, $0x01000000 + CMPL R9, $0x01000000 JLT four_bytes_match_emit_encodeBlockAsm14B - MOVB $0xfc, (SI) - MOVL R8, 1(SI) - ADDQ $0x05, SI + MOVB $0xfc, (DI) + MOVL R9, 1(DI) + ADDQ $0x05, DI JMP memmove_match_emit_encodeBlockAsm14B four_bytes_match_emit_encodeBlockAsm14B: - MOVQ R8, R9 - SHRL $0x10, R9 - MOVB $0xf8, (SI) - MOVW R8, 1(SI) - MOVB R9, 3(SI) - ADDQ $0x04, SI + MOVQ R9, R10 + SHRL $0x10, R10 + MOVB $0xf8, (DI) + MOVW R9, 1(DI) + MOVB R10, 3(DI) + ADDQ $0x04, DI JMP memmove_match_emit_encodeBlockAsm14B three_bytes_match_emit_encodeBlockAsm14B: - MOVB $0xf4, (SI) - MOVW R8, 1(SI) - ADDQ $0x03, SI + MOVB $0xf4, (DI) + MOVW R9, 1(DI) + ADDQ $0x03, DI JMP memmove_match_emit_encodeBlockAsm14B two_bytes_match_emit_encodeBlockAsm14B: - MOVB $0xf0, (SI) - MOVB R8, 1(SI) - ADDQ $0x02, SI + MOVB $0xf0, (DI) + MOVB R9, 1(DI) + ADDQ $0x02, DI JMP memmove_match_emit_encodeBlockAsm14B one_byte_match_emit_encodeBlockAsm14B: - SHLB $0x02, R8 - MOVB R8, (SI) - ADDQ $0x01, SI + SHLB $0x02, R9 + MOVB R9, (DI) + ADDQ $0x01, DI memmove_match_emit_encodeBlockAsm14B: - LEAQ (SI)(DI*1), R8 + LEAQ (DI)(R8*1), R9 NOP emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_tail: - TESTQ DI, DI + TESTQ R8, R8 JEQ emit_literal_done_match_emit_encodeBlockAsm14B - CMPQ DI, $0x02 + CMPQ R8, $0x02 JBE emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_1or2 - CMPQ DI, $0x04 + CMPQ R8, $0x04 JB emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_3 JBE emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_4 - CMPQ DI, $0x08 + CMPQ R8, $0x08 JB emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_5through7 JE emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_8 - CMPQ DI, $0x10 + CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_9through16 - CMPQ DI, $0x20 + CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_17through32 - CMPQ DI, $0x40 + CMPQ R8, $0x40 JBE emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_33through64 - CMPQ DI, $0x80 + CMPQ R8, $0x80 JBE emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_65through128 - CMPQ DI, $0x00000100 + CMPQ R8, $0x00000100 JBE emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_129through256 JMP emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_256through2048 emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_1or2: - MOVB (BP), R8 - MOVB -1(BP)(DI*1), BP - MOVB R8, (SI) - MOVB BP, -1(SI)(DI*1) + MOVB (SI), R9 + MOVB -1(SI)(R8*1), SI + MOVB R9, (DI) + MOVB SI, -1(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm14B emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_4: - MOVL (BP), R8 - MOVL R8, (SI) + MOVL (SI), R9 + MOVL R9, (DI) JMP emit_literal_done_match_emit_encodeBlockAsm14B emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_3: - MOVW (BP), R8 - MOVB 2(BP), BP - MOVW R8, (SI) - MOVB BP, 2(SI) + MOVW (SI), R9 + MOVB 2(SI), SI + MOVW R9, (DI) + MOVB SI, 2(DI) JMP emit_literal_done_match_emit_encodeBlockAsm14B emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_5through7: - MOVL (BP), R8 - MOVL -4(BP)(DI*1), BP - MOVL R8, (SI) - MOVL BP, -4(SI)(DI*1) + MOVL (SI), R9 + MOVL -4(SI)(R8*1), SI + MOVL R9, (DI) + MOVL SI, -4(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm14B emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_8: - MOVQ (BP), R8 - MOVQ R8, (SI) + MOVQ (SI), R9 + MOVQ R9, (DI) JMP emit_literal_done_match_emit_encodeBlockAsm14B emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_9through16: - MOVQ (BP), R8 - MOVQ -8(BP)(DI*1), BP - MOVQ R8, (SI) - MOVQ BP, -8(SI)(DI*1) + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), SI + MOVQ R9, (DI) + MOVQ SI, -8(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm14B emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_17through32: - MOVOU (BP), X0 - MOVOU -16(BP)(DI*1), X1 - MOVOU X0, (SI) - MOVOU X1, -16(SI)(DI*1) + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 + MOVOU X0, (DI) + MOVOU X1, -16(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm14B emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_33through64: - MOVOU (BP), X0 - MOVOU 16(BP), X1 - MOVOU -32(BP)(DI*1), X2 - MOVOU -16(BP)(DI*1), X3 - MOVOU X0, (SI) - MOVOU X1, 16(SI) - MOVOU X2, -32(SI)(DI*1) - MOVOU X3, -16(SI)(DI*1) + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, -32(DI)(R8*1) + MOVOU X3, -16(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm14B emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_65through128: - MOVOU (BP), X0 - MOVOU 16(BP), X1 - MOVOU 32(BP), X2 - MOVOU 48(BP), X3 - MOVOU -64(BP)(DI*1), X12 - MOVOU -48(BP)(DI*1), X13 - MOVOU -32(BP)(DI*1), X14 - MOVOU -16(BP)(DI*1), X15 - MOVOU X0, (SI) - MOVOU X1, 16(SI) - MOVOU X2, 32(SI) - MOVOU X3, 48(SI) - MOVOU X12, -64(SI)(DI*1) - MOVOU X13, -48(SI)(DI*1) - MOVOU X14, -32(SI)(DI*1) - MOVOU X15, -16(SI)(DI*1) + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) + MOVOU X3, 48(DI) + MOVOU X12, -64(DI)(R8*1) + MOVOU X13, -48(DI)(R8*1) + MOVOU X14, -32(DI)(R8*1) + MOVOU X15, -16(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm14B emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_129through256: - MOVOU (BP), X0 - MOVOU 16(BP), X1 - MOVOU 32(BP), X2 - MOVOU 48(BP), X3 - MOVOU 64(BP), X4 - MOVOU 80(BP), X5 - MOVOU 96(BP), X6 - MOVOU 112(BP), X7 - MOVOU -128(BP)(DI*1), X8 - MOVOU -112(BP)(DI*1), X9 - MOVOU -96(BP)(DI*1), X10 - MOVOU -80(BP)(DI*1), X11 - MOVOU -64(BP)(DI*1), X12 - MOVOU -48(BP)(DI*1), X13 - MOVOU -32(BP)(DI*1), X14 - MOVOU -16(BP)(DI*1), X15 - MOVOU X0, (SI) - MOVOU X1, 16(SI) - MOVOU X2, 32(SI) - MOVOU X3, 48(SI) - MOVOU X4, 64(SI) - MOVOU X5, 80(SI) - MOVOU X6, 96(SI) - MOVOU X7, 112(SI) - MOVOU X8, -128(SI)(DI*1) - MOVOU X9, -112(SI)(DI*1) - MOVOU X10, -96(SI)(DI*1) - MOVOU X11, -80(SI)(DI*1) - MOVOU X12, -64(SI)(DI*1) - MOVOU X13, -48(SI)(DI*1) - MOVOU X14, -32(SI)(DI*1) - MOVOU X15, -16(SI)(DI*1) + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU -128(SI)(R8*1), X8 + MOVOU -112(SI)(R8*1), X9 + MOVOU -96(SI)(R8*1), X10 + MOVOU -80(SI)(R8*1), X11 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) + MOVOU X3, 48(DI) + MOVOU X4, 64(DI) + MOVOU X5, 80(DI) + MOVOU X6, 96(DI) + MOVOU X7, 112(DI) + MOVOU X8, -128(DI)(R8*1) + MOVOU X9, -112(DI)(R8*1) + MOVOU X10, -96(DI)(R8*1) + MOVOU X11, -80(DI)(R8*1) + MOVOU X12, -64(DI)(R8*1) + MOVOU X13, -48(DI)(R8*1) + MOVOU X14, -32(DI)(R8*1) + MOVOU X15, -16(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm14B emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_256through2048: - LEAQ -256(DI), DI - MOVOU (BP), X0 - MOVOU 16(BP), X1 - MOVOU 32(BP), X2 - MOVOU 48(BP), X3 - MOVOU 64(BP), X4 - MOVOU 80(BP), X5 - MOVOU 96(BP), X6 - MOVOU 112(BP), X7 - MOVOU 128(BP), X8 - MOVOU 144(BP), X9 - MOVOU 160(BP), X10 - MOVOU 176(BP), X11 - MOVOU 192(BP), X12 - MOVOU 208(BP), X13 - MOVOU 224(BP), X14 - MOVOU 240(BP), X15 - MOVOU X0, (SI) - MOVOU X1, 16(SI) - MOVOU X2, 32(SI) - MOVOU X3, 48(SI) - MOVOU X4, 64(SI) - MOVOU X5, 80(SI) - MOVOU X6, 96(SI) - MOVOU X7, 112(SI) - MOVOU X8, 128(SI) - MOVOU X9, 144(SI) - MOVOU X10, 160(SI) - MOVOU X11, 176(SI) - MOVOU X12, 192(SI) - MOVOU X13, 208(SI) - MOVOU X14, 224(SI) - MOVOU X15, 240(SI) - CMPQ DI, $0x00000100 - LEAQ 256(BP), BP + LEAQ -256(R8), R8 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU 128(SI), X8 + MOVOU 144(SI), X9 + MOVOU 160(SI), X10 + MOVOU 176(SI), X11 + MOVOU 192(SI), X12 + MOVOU 208(SI), X13 + MOVOU 224(SI), X14 + MOVOU 240(SI), X15 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) + MOVOU X3, 48(DI) + MOVOU X4, 64(DI) + MOVOU X5, 80(DI) + MOVOU X6, 96(DI) + MOVOU X7, 112(DI) + MOVOU X8, 128(DI) + MOVOU X9, 144(DI) + MOVOU X10, 160(DI) + MOVOU X11, 176(DI) + MOVOU X12, 192(DI) + MOVOU X13, 208(DI) + MOVOU X14, 224(DI) + MOVOU X15, 240(DI) + CMPQ R8, $0x00000100 LEAQ 256(SI), SI + LEAQ 256(DI), DI JGE emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_move_256through2048 JMP emit_lit_memmove_match_emit_encodeBlockAsm14B_memmove_tail - MOVQ R8, SI + MOVQ R9, DI emit_literal_done_match_emit_encodeBlockAsm14B: - MOVQ SI, dst_base+0(FP) + MOVQ DI, dst_base+0(FP) emit_literal_skip_match_emit_encodeBlockAsm14B: NOP match_nolit_loop_encodeBlockAsm14B: - MOVL AX, BP - MOVL AX, BP - SUBL BX, BP - MOVL BP, 24(SP) + MOVL AX, SI + MOVL AX, SI + SUBL BX, SI + MOVL SI, 24(SP) ADDL $0x04, AX ADDL $0x04, BX - MOVL 16(SP), BP - SUBL AX, BP - XORQ DI, DI - CMPQ BP, $0x08 + MOVL 16(SP), SI + SUBL AX, SI + XORQ R8, R8 + CMPQ SI, $0x08 JL matchlen_single_match_nolit_encodeBlockAsm14B matchlen_loopback_match_nolit_encodeBlockAsm14B: - MOVQ (CX)(DI*1), SI - XORQ (CX)(DI*1), SI - TESTQ SI, SI + MOVQ (CX)(R8*1), DI + XORQ (CX)(R8*1), DI + TESTQ DI, DI JZ matchlen_loop_match_nolit_encodeBlockAsm14B - BSFQ SI, SI - SARQ $0x03, SI - LEAQ (DI)(SI*1), DI + BSFQ DI, DI + SARQ $0x03, DI + LEAQ (R8)(DI*1), R8 JMP match_nolit_end_encodeBlockAsm14B matchlen_loop_match_nolit_encodeBlockAsm14B: - LEAQ -8(BP), BP - LEAQ 8(DI), DI - CMPQ BP, $0x08 + LEAQ -8(SI), SI + LEAQ 8(R8), R8 + CMPQ SI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm14B matchlen_single_match_nolit_encodeBlockAsm14B: - TESTQ BP, BP + TESTQ SI, SI JZ match_nolit_end_encodeBlockAsm14B matchlen_single_loopback_match_nolit_encodeBlockAsm14B: - MOVB (CX)(DI*1), SI - CMPB (CX)(DI*1), SI + MOVB (CX)(R8*1), DI + CMPB (CX)(R8*1), DI JNE match_nolit_end_encodeBlockAsm14B - LEAQ 1(DI), DI - DECQ BP + LEAQ 1(R8), R8 + DECQ SI JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm14B match_nolit_end_encodeBlockAsm14B: - MOVL 24(SP), BP - ADDQ $0x04, DI - MOVQ dst_base+0(FP), SI - ADDL DI, AX - CMPL BP, $0x00010000 + MOVL 24(SP), SI + ADDQ $0x04, R8 + MOVQ dst_base+0(FP), DI + ADDL R8, AX + CMPL SI, $0x00010000 JL two_byte_offset_match_nolit_encodeBlockAsm14B - CMPL DI, $0x40 + CMPL R8, $0x40 JLE four_bytes_remain_match_nolit_encodeBlockAsm14B - MOVB $0xff, (SI) - MOVD BP, 1(SI) - LEAQ -64(DI), DI - ADDQ $0x05, SI - CMPL DI, $0x04 + MOVB $0xff, (DI) + MOVD SI, 1(DI) + LEAQ -64(R8), R8 + ADDQ $0x05, DI + CMPL R8, $0x04 JL four_bytes_remain_match_nolit_encodeBlockAsm14B emit_repeat_again_match_nolit_encodeBlockAsm14B_emit_copy: - MOVQ DI, R8 - LEAQ -4(DI), DI - CMPL R8, $0x08 + MOVQ R8, R9 + LEAQ -4(R8), R8 + CMPL R9, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm14B_emit_copy - CMPL R8, $0x0c + CMPL R9, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm14B_emit_copy - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm14B_emit_copy cant_repeat_two_offset_match_nolit_encodeBlockAsm14B_emit_copy: - CMPL DI, $0x00000104 + CMPL R8, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm14B_emit_copy - CMPL DI, $0x00010100 + CMPL R8, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm14B_emit_copy - CMPL DI, $0x0100ffff + CMPL R8, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsm14B_emit_copy - LEAQ -16842747(DI), DI - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + LEAQ -16842747(R8), R8 + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_match_nolit_encodeBlockAsm14B_emit_copy repeat_five_match_nolit_encodeBlockAsm14B_emit_copy: - LEAQ -65536(DI), DI - MOVQ DI, BP - MOVW $0x001d, (SI) - MOVW DI, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + LEAQ -65536(R8), R8 + MOVQ R8, SI + MOVW $0x001d, (DI) + MOVW R8, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP match_nolit_emitcopy_end_encodeBlockAsm14B repeat_four_match_nolit_encodeBlockAsm14B_emit_copy: - LEAQ -256(DI), DI - MOVW $0x0019, (SI) - MOVW DI, 2(SI) - ADDQ $0x04, SI + LEAQ -256(R8), R8 + MOVW $0x0019, (DI) + MOVW R8, 2(DI) + ADDQ $0x04, DI JMP match_nolit_emitcopy_end_encodeBlockAsm14B repeat_three_match_nolit_encodeBlockAsm14B_emit_copy: - LEAQ -4(DI), DI - MOVW $0x0015, (SI) - MOVB DI, 2(SI) - ADDQ $0x03, SI + LEAQ -4(R8), R8 + MOVW $0x0015, (DI) + MOVB R8, 2(DI) + ADDQ $0x03, DI JMP match_nolit_emitcopy_end_encodeBlockAsm14B repeat_two_match_nolit_encodeBlockAsm14B_emit_copy: - SHLL $0x02, DI - ORL $0x01, DI - MOVW DI, (SI) - ADDQ $0x02, SI + SHLL $0x02, R8 + ORL $0x01, R8 + MOVW R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsm14B repeat_two_offset_match_nolit_encodeBlockAsm14B_emit_copy: - XORQ R8, R8 - LEAQ 1(R8)(DI*4), DI - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, DI - MOVB DI, (SI) - ADDQ $0x02, SI + XORQ R9, R9 + LEAQ 1(R9)(R8*4), R8 + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R8 + MOVB R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsm14B four_bytes_remain_match_nolit_encodeBlockAsm14B: - TESTL DI, DI + TESTL R8, R8 JZ match_nolit_emitcopy_end_encodeBlockAsm14B MOVB $0x03, DL - LEAQ -4(DX)(DI*4), DI - MOVB DI, (SI) - MOVD BP, 1(SI) - ADDQ $0x05, SI + LEAQ -4(DX)(R8*4), R8 + MOVB R8, (DI) + MOVD SI, 1(DI) + ADDQ $0x05, DI JMP match_nolit_emitcopy_end_encodeBlockAsm14B two_byte_offset_match_nolit_encodeBlockAsm14B: - CMPL DI, $0x40 + CMPL R8, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm14B - MOVB $0xee, (SI) - MOVW BP, 1(SI) - LEAQ -60(DI), DI - ADDQ $0x03, SI + MOVB $0xee, (DI) + MOVW SI, 1(DI) + LEAQ -60(R8), R8 + ADDQ $0x03, DI emit_repeat_again_match_nolit_encodeBlockAsm14B_emit_copy_short: - MOVQ DI, R8 - LEAQ -4(DI), DI - CMPL R8, $0x08 + MOVQ R8, R9 + LEAQ -4(R8), R8 + CMPL R9, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm14B_emit_copy_short - CMPL R8, $0x0c + CMPL R9, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm14B_emit_copy_short - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm14B_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBlockAsm14B_emit_copy_short: - CMPL DI, $0x00000104 + CMPL R8, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm14B_emit_copy_short - CMPL DI, $0x00010100 + CMPL R8, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm14B_emit_copy_short - CMPL DI, $0x0100ffff + CMPL R8, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsm14B_emit_copy_short - LEAQ -16842747(DI), DI - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + LEAQ -16842747(R8), R8 + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_match_nolit_encodeBlockAsm14B_emit_copy_short repeat_five_match_nolit_encodeBlockAsm14B_emit_copy_short: - LEAQ -65536(DI), DI - MOVQ DI, BP - MOVW $0x001d, (SI) - MOVW DI, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + LEAQ -65536(R8), R8 + MOVQ R8, SI + MOVW $0x001d, (DI) + MOVW R8, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP match_nolit_emitcopy_end_encodeBlockAsm14B repeat_four_match_nolit_encodeBlockAsm14B_emit_copy_short: - LEAQ -256(DI), DI - MOVW $0x0019, (SI) - MOVW DI, 2(SI) - ADDQ $0x04, SI + LEAQ -256(R8), R8 + MOVW $0x0019, (DI) + MOVW R8, 2(DI) + ADDQ $0x04, DI JMP match_nolit_emitcopy_end_encodeBlockAsm14B repeat_three_match_nolit_encodeBlockAsm14B_emit_copy_short: - LEAQ -4(DI), DI - MOVW $0x0015, (SI) - MOVB DI, 2(SI) - ADDQ $0x03, SI + LEAQ -4(R8), R8 + MOVW $0x0015, (DI) + MOVB R8, 2(DI) + ADDQ $0x03, DI JMP match_nolit_emitcopy_end_encodeBlockAsm14B repeat_two_match_nolit_encodeBlockAsm14B_emit_copy_short: - SHLL $0x02, DI - ORL $0x01, DI - MOVW DI, (SI) - ADDQ $0x02, SI + SHLL $0x02, R8 + ORL $0x01, R8 + MOVW R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsm14B repeat_two_offset_match_nolit_encodeBlockAsm14B_emit_copy_short: - XORQ R8, R8 - LEAQ 1(R8)(DI*4), DI - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, DI - MOVB DI, (SI) - ADDQ $0x02, SI + XORQ R9, R9 + LEAQ 1(R9)(R8*4), R8 + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R8 + MOVB R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsm14B two_byte_offset_short_match_nolit_encodeBlockAsm14B: - CMPL DI, $0x0c + CMPL R8, $0x0c JGE emit_copy_three_match_nolit_encodeBlockAsm14B - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_match_nolit_encodeBlockAsm14B MOVB $0x01, DL - LEAQ -16(DX)(DI*4), DI - MOVB BP, 1(SI) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, DI - MOVB DI, (SI) - ADDQ $0x02, SI + LEAQ -16(DX)(R8*4), R8 + MOVB SI, 1(DI) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R8 + MOVB R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsm14B emit_copy_three_match_nolit_encodeBlockAsm14B: MOVB $0x02, DL - LEAQ -4(DX)(DI*4), DI - MOVB DI, (SI) - MOVW BP, 1(SI) - ADDQ $0x03, SI + LEAQ -4(DX)(R8*4), R8 + MOVB R8, (DI) + MOVW SI, 1(DI) + ADDQ $0x03, DI match_nolit_emitcopy_end_encodeBlockAsm14B: - MOVQ SI, dst_base+0(FP) + MOVQ DI, dst_base+0(FP) MOVL AX, 20(SP) CMPL AX, 16(SP) JGE emit_remainder_encodeBlockAsm14B - CMPQ SI, (SP) + CMPQ DI, (SP) JL match_nolit_dst_ok_encodeBlockAsm14B MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeBlockAsm14B: - MOVQ -2(CX)(AX*1), BP - MOVQ $0x0000cf1bbcdcbf9b, SI - MOVQ BP, DI - SHRQ $0x10, BP - MOVQ BP, R8 - SHLQ $0x10, DI - IMULQ SI, DI - SHRQ $0x32, DI + MOVQ -2(CX)(AX*1), SI + MOVQ $0x0000cf1bbcdcbf9b, DI + MOVQ SI, R8 + SHRQ $0x10, SI + MOVQ SI, R9 SHLQ $0x10, R8 - IMULQ SI, R8 + IMULQ DI, R8 SHRQ $0x32, R8 - MOVL 32(SP)(DI*1), SI - MOVL 32(SP)(R8*1), SI - LEAQ -2(AX), SI - MOVL SI, 32(SP)(DI*1) - MOVL AX, 32(SP)(R8*1) - CMPL (CX)(R8*1), BP + SHLQ $0x10, R9 + IMULQ DI, R9 + SHRQ $0x32, R9 + MOVL 32(SP)(R8*1), DI + MOVL 32(SP)(R9*1), DI + LEAQ -2(AX), DI + MOVL DI, 32(SP)(R8*1) + MOVL AX, 32(SP)(R9*1) + CMPL (CX)(R9*1), SI JEQ match_nolit_loop_encodeBlockAsm14B INCL AX JMP search_loop_encodeBlockAsm14B @@ -2640,11 +2640,11 @@ emit_remainder_ok_encodeBlockAsm14B: JMP memmove_emit_remainder_encodeBlockAsm14B four_bytes_emit_remainder_encodeBlockAsm14B: - MOVQ DX, BP - SHRL $0x10, BP + MOVQ DX, SI + SHRL $0x10, SI MOVB $0xf8, (CX) MOVW DX, 1(CX) - MOVB BP, 3(CX) + MOVB SI, 3(CX) ADDQ $0x04, CX JMP memmove_emit_remainder_encodeBlockAsm14B @@ -2887,224 +2887,224 @@ zero_loop_encodeBlockAsm12B: MOVQ src_base+24(FP), CX search_loop_encodeBlockAsm12B: - MOVQ (CX)(AX*1), BP + MOVQ (CX)(AX*1), SI MOVL AX, BX SUBL 20(SP), BX SHRL $0x04, BX LEAQ 4(AX)(BX*1), BX - MOVL 16(SP), SI - CMPL BX, SI + MOVL 16(SP), DI + CMPL BX, DI JGT emit_remainder_encodeBlockAsm12B MOVL BX, 28(SP) MOVQ $0x0000cf1bbcdcbf9b, BX - MOVQ BP, DI - MOVQ BP, R8 - SHRQ $0x08, R8 - SHLQ $0x10, DI - IMULQ BX, DI - SHRQ $0x34, DI + MOVQ SI, R8 + MOVQ SI, R9 + SHRQ $0x08, R9 SHLQ $0x10, R8 IMULQ BX, R8 SHRQ $0x34, R8 - MOVL 32(SP)(DI*1), BX - MOVL 32(SP)(R8*1), SI - MOVL AX, 32(SP)(DI*1) - LEAL 1(AX), DI - MOVL DI, 32(SP)(R8*1) - MOVL AX, DI - SUBL 24(SP), DI - MOVL 1(CX)(DI*1), R9 - MOVQ BP, R8 - SHLQ $0x08, R8 - CMPL R8, R9 + SHLQ $0x10, R9 + IMULQ BX, R9 + SHRQ $0x34, R9 + MOVL 32(SP)(R8*1), BX + MOVL 32(SP)(R9*1), DI + MOVL AX, 32(SP)(R8*1) + LEAL 1(AX), R8 + MOVL R8, 32(SP)(R9*1) + MOVL AX, R8 + SUBL 24(SP), R8 + MOVL 1(CX)(R8*1), R10 + MOVQ SI, R9 + SHLQ $0x08, R9 + CMPL R9, R10 JNE no_repeat_found_encodeBlockAsm12B - LEAQ 1(AX), BP + LEAQ 1(AX), SI MOVL 20(SP), BX - TESTL DI, DI + TESTL R8, R8 JZ repeat_extend_back_end_encodeBlockAsm12B repeat_extend_back_loop_encodeBlockAsm12B: - CMPL BP, BX + CMPL SI, BX JG repeat_extend_back_end_encodeBlockAsm12B - MOVB -1(CX)(DI*1), DL - MOVB -1(CX)(BP*1), SI - CMPB DL, SI + MOVB -1(CX)(R8*1), DL + MOVB -1(CX)(SI*1), DI + CMPB DL, DI JNE repeat_extend_back_end_encodeBlockAsm12B - LEAQ -1(BP), BP - DECL DI + LEAQ -1(SI), SI + DECL R8 JZ repeat_extend_back_end_encodeBlockAsm12B JMP repeat_extend_back_loop_encodeBlockAsm12B repeat_extend_back_end_encodeBlockAsm12B: MOVL 20(SP), BX - CMPL BX, BP + CMPL BX, SI JEQ emit_literal_skip_repeat_emit_encodeBlockAsm12B - MOVL BP, SI - MOVL BP, 20(SP) - LEAQ (CX)(BX*1), DI - SUBL BX, SI + MOVL SI, DI + MOVL SI, 20(SP) + LEAQ (CX)(BX*1), R8 + SUBL BX, DI MOVQ dst_base+0(FP), BX - MOVQ SI, R8 - SUBL $0x01, R8 + MOVQ DI, R9 + SUBL $0x01, R9 JC emit_literal_done_repeat_emit_encodeBlockAsm12B - CMPL R8, $0x3c + CMPL R9, $0x3c JLT one_byte_repeat_emit_encodeBlockAsm12B - CMPL R8, $0x00000100 + CMPL R9, $0x00000100 JLT two_bytes_repeat_emit_encodeBlockAsm12B - CMPL R8, $0x00010000 + CMPL R9, $0x00010000 JLT three_bytes_repeat_emit_encodeBlockAsm12B - CMPL R8, $0x01000000 + CMPL R9, $0x01000000 JLT four_bytes_repeat_emit_encodeBlockAsm12B MOVB $0xfc, (BX) - MOVL R8, 1(BX) + MOVL R9, 1(BX) ADDQ $0x05, BX JMP memmove_repeat_emit_encodeBlockAsm12B four_bytes_repeat_emit_encodeBlockAsm12B: - MOVQ R8, R9 - SHRL $0x10, R9 + MOVQ R9, R10 + SHRL $0x10, R10 MOVB $0xf8, (BX) - MOVW R8, 1(BX) - MOVB R9, 3(BX) + MOVW R9, 1(BX) + MOVB R10, 3(BX) ADDQ $0x04, BX JMP memmove_repeat_emit_encodeBlockAsm12B three_bytes_repeat_emit_encodeBlockAsm12B: MOVB $0xf4, (BX) - MOVW R8, 1(BX) + MOVW R9, 1(BX) ADDQ $0x03, BX JMP memmove_repeat_emit_encodeBlockAsm12B two_bytes_repeat_emit_encodeBlockAsm12B: MOVB $0xf0, (BX) - MOVB R8, 1(BX) + MOVB R9, 1(BX) ADDQ $0x02, BX JMP memmove_repeat_emit_encodeBlockAsm12B one_byte_repeat_emit_encodeBlockAsm12B: - SHLB $0x02, R8 - MOVB R8, (BX) + SHLB $0x02, R9 + MOVB R9, (BX) ADDQ $0x01, BX memmove_repeat_emit_encodeBlockAsm12B: - LEAQ (BX)(SI*1), R8 + LEAQ (BX)(DI*1), R9 NOP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_tail: - TESTQ SI, SI + TESTQ DI, DI JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B - CMPQ SI, $0x02 + CMPQ DI, $0x02 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_1or2 - CMPQ SI, $0x04 + CMPQ DI, $0x04 JB emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_3 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_4 - CMPQ SI, $0x08 + CMPQ DI, $0x08 JB emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_5through7 JE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8 - CMPQ SI, $0x10 + CMPQ DI, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_9through16 - CMPQ SI, $0x20 + CMPQ DI, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32 - CMPQ SI, $0x40 + CMPQ DI, $0x40 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64 - CMPQ SI, $0x80 + CMPQ DI, $0x80 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_65through128 - CMPQ SI, $0x00000100 + CMPQ DI, $0x00000100 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_129through256 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_256through2048 emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_1or2: - MOVB (DI), R8 - MOVB -1(DI)(SI*1), DI - MOVB R8, (BX) - MOVB DI, -1(BX)(SI*1) + MOVB (R8), R9 + MOVB -1(R8)(DI*1), R8 + MOVB R9, (BX) + MOVB R8, -1(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_4: - MOVL (DI), R8 - MOVL R8, (BX) + MOVL (R8), R9 + MOVL R9, (BX) JMP emit_literal_done_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_3: - MOVW (DI), R8 - MOVB 2(DI), DI - MOVW R8, (BX) - MOVB DI, 2(BX) + MOVW (R8), R9 + MOVB 2(R8), R8 + MOVW R9, (BX) + MOVB R8, 2(BX) JMP emit_literal_done_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_5through7: - MOVL (DI), R8 - MOVL -4(DI)(SI*1), DI - MOVL R8, (BX) - MOVL DI, -4(BX)(SI*1) + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R8 + MOVL R9, (BX) + MOVL R8, -4(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8: - MOVQ (DI), R8 - MOVQ R8, (BX) + MOVQ (R8), R9 + MOVQ R9, (BX) JMP emit_literal_done_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_9through16: - MOVQ (DI), R8 - MOVQ -8(DI)(SI*1), DI - MOVQ R8, (BX) - MOVQ DI, -8(BX)(SI*1) + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (BX) + MOVQ R8, -8(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(SI*1), X1 + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 MOVOU X0, (BX) - MOVOU X1, -16(BX)(SI*1) + MOVOU X1, -16(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(SI*1), X2 - MOVOU -16(DI)(SI*1), X3 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 MOVOU X0, (BX) MOVOU X1, 16(BX) - MOVOU X2, -32(BX)(SI*1) - MOVOU X3, -16(BX)(SI*1) + MOVOU X2, -32(BX)(DI*1) + MOVOU X3, -16(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_65through128: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU 32(DI), X2 - MOVOU 48(DI), X3 - MOVOU -64(DI)(SI*1), X12 - MOVOU -48(DI)(SI*1), X13 - MOVOU -32(DI)(SI*1), X14 - MOVOU -16(DI)(SI*1), X15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (BX) MOVOU X1, 16(BX) MOVOU X2, 32(BX) MOVOU X3, 48(BX) - MOVOU X12, -64(BX)(SI*1) - MOVOU X13, -48(BX)(SI*1) - MOVOU X14, -32(BX)(SI*1) - MOVOU X15, -16(BX)(SI*1) + MOVOU X12, -64(BX)(DI*1) + MOVOU X13, -48(BX)(DI*1) + MOVOU X14, -32(BX)(DI*1) + MOVOU X15, -16(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_129through256: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU 32(DI), X2 - MOVOU 48(DI), X3 - MOVOU 64(DI), X4 - MOVOU 80(DI), X5 - MOVOU 96(DI), X6 - MOVOU 112(DI), X7 - MOVOU -128(DI)(SI*1), X8 - MOVOU -112(DI)(SI*1), X9 - MOVOU -96(DI)(SI*1), X10 - MOVOU -80(DI)(SI*1), X11 - MOVOU -64(DI)(SI*1), X12 - MOVOU -48(DI)(SI*1), X13 - MOVOU -32(DI)(SI*1), X14 - MOVOU -16(DI)(SI*1), X15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU -128(R8)(DI*1), X8 + MOVOU -112(R8)(DI*1), X9 + MOVOU -96(R8)(DI*1), X10 + MOVOU -80(R8)(DI*1), X11 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (BX) MOVOU X1, 16(BX) MOVOU X2, 32(BX) @@ -3113,34 +3113,34 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_129through256: MOVOU X5, 80(BX) MOVOU X6, 96(BX) MOVOU X7, 112(BX) - MOVOU X8, -128(BX)(SI*1) - MOVOU X9, -112(BX)(SI*1) - MOVOU X10, -96(BX)(SI*1) - MOVOU X11, -80(BX)(SI*1) - MOVOU X12, -64(BX)(SI*1) - MOVOU X13, -48(BX)(SI*1) - MOVOU X14, -32(BX)(SI*1) - MOVOU X15, -16(BX)(SI*1) + MOVOU X8, -128(BX)(DI*1) + MOVOU X9, -112(BX)(DI*1) + MOVOU X10, -96(BX)(DI*1) + MOVOU X11, -80(BX)(DI*1) + MOVOU X12, -64(BX)(DI*1) + MOVOU X13, -48(BX)(DI*1) + MOVOU X14, -32(BX)(DI*1) + MOVOU X15, -16(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_256through2048: - LEAQ -256(SI), SI - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU 32(DI), X2 - MOVOU 48(DI), X3 - MOVOU 64(DI), X4 - MOVOU 80(DI), X5 - MOVOU 96(DI), X6 - MOVOU 112(DI), X7 - MOVOU 128(DI), X8 - MOVOU 144(DI), X9 - MOVOU 160(DI), X10 - MOVOU 176(DI), X11 - MOVOU 192(DI), X12 - MOVOU 208(DI), X13 - MOVOU 224(DI), X14 - MOVOU 240(DI), X15 + LEAQ -256(DI), DI + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU 128(R8), X8 + MOVOU 144(R8), X9 + MOVOU 160(R8), X10 + MOVOU 176(R8), X11 + MOVOU 192(R8), X12 + MOVOU 208(R8), X13 + MOVOU 224(R8), X14 + MOVOU 240(R8), X15 MOVOU X0, (BX) MOVOU X1, 16(BX) MOVOU X2, 32(BX) @@ -3157,12 +3157,12 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_256through2048: MOVOU X13, 208(BX) MOVOU X14, 224(BX) MOVOU X15, 240(BX) - CMPQ SI, $0x00000100 - LEAQ 256(DI), DI + CMPQ DI, $0x00000100 + LEAQ 256(R8), R8 LEAQ 256(BX), BX JGE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_256through2048 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_tail - MOVQ R8, BX + MOVQ R9, BX emit_literal_done_repeat_emit_encodeBlockAsm12B: MOVQ BX, dst_base+0(FP) @@ -3173,23 +3173,23 @@ emit_literal_skip_repeat_emit_encodeBlockAsm12B: SUBL 24(SP), BX MOVL 16(SP), BX SUBL AX, BX - XORQ DI, DI + XORQ R8, R8 CMPQ BX, $0x08 JL matchlen_single_repeat_extend matchlen_loopback_repeat_extend: - MOVQ (CX)(DI*1), SI - XORQ (CX)(DI*1), SI - TESTQ SI, SI + MOVQ (CX)(R8*1), DI + XORQ (CX)(R8*1), DI + TESTQ DI, DI JZ matchlen_loop_repeat_extend - BSFQ SI, SI - SARQ $0x03, SI - LEAQ (DI)(SI*1), DI + BSFQ DI, DI + SARQ $0x03, DI + LEAQ (R8)(DI*1), R8 JMP repeat_extend_forward_end_encodeBlockAsm12B matchlen_loop_repeat_extend: LEAQ -8(BX), BX - LEAQ 8(DI), DI + LEAQ 8(R8), R8 CMPQ BX, $0x08 JGE matchlen_loopback_repeat_extend @@ -3198,31 +3198,31 @@ matchlen_single_repeat_extend: JZ repeat_extend_forward_end_encodeBlockAsm12B matchlen_single_loopback_repeat_extend: - MOVB (CX)(DI*1), SI - CMPB (CX)(DI*1), SI + MOVB (CX)(R8*1), DI + CMPB (CX)(R8*1), DI JNE repeat_extend_forward_end_encodeBlockAsm12B - LEAQ 1(DI), DI + LEAQ 1(R8), R8 DECQ BX JNZ matchlen_single_loopback_repeat_extend repeat_extend_forward_end_encodeBlockAsm12B: - ADDL DI, AX + ADDL R8, AX MOVL AX, BX - SUBL BP, BX - MOVL 24(SP), BP - MOVQ dst_base+0(FP), SI - MOVL 20(SP), DI - TESTL DI, DI + SUBL SI, BX + MOVL 24(SP), SI + MOVQ dst_base+0(FP), DI + MOVL 20(SP), R8 + TESTL R8, R8 JZ repeat_as_copy_encodeBlockAsm12B emit_repeat_again_match_repeat_: - MOVQ BX, DI + MOVQ BX, R8 LEAQ -4(BX), BX - CMPL DI, $0x08 + CMPL R8, $0x08 JLE repeat_two_match_repeat_ - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_match_repeat_ - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_repeat_ cant_repeat_two_offset_match_repeat_: @@ -3233,74 +3233,74 @@ cant_repeat_two_offset_match_repeat_: CMPL BX, $0x0100ffff JLT repeat_five_match_repeat_ LEAQ -16842747(BX), BX - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_match_repeat_ repeat_five_match_repeat_: LEAQ -65536(BX), BX - MOVQ BX, BP - MOVW $0x001d, (SI) - MOVW BX, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + MOVQ BX, SI + MOVW $0x001d, (DI) + MOVW BX, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP repeat_end_emit_encodeBlockAsm12B repeat_four_match_repeat_: LEAQ -256(BX), BX - MOVW $0x0019, (SI) - MOVW BX, 2(SI) - ADDQ $0x04, SI + MOVW $0x0019, (DI) + MOVW BX, 2(DI) + ADDQ $0x04, DI JMP repeat_end_emit_encodeBlockAsm12B repeat_three_match_repeat_: LEAQ -4(BX), BX - MOVW $0x0015, (SI) - MOVB BL, 2(SI) - ADDQ $0x03, SI + MOVW $0x0015, (DI) + MOVB BL, 2(DI) + ADDQ $0x03, DI JMP repeat_end_emit_encodeBlockAsm12B repeat_two_match_repeat_: SHLL $0x02, BX ORL $0x01, BX - MOVW BX, (SI) - ADDQ $0x02, SI + MOVW BX, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm12B repeat_two_offset_match_repeat_: - XORQ DI, DI - LEAQ 1(DI)(BX*4), BX - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (SI) - ADDQ $0x02, SI + XORQ R8, R8 + LEAQ 1(R8)(BX*4), BX + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm12B repeat_as_copy_encodeBlockAsm12B: - CMPL BP, $0x00010000 + CMPL SI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeBlockAsm12B CMPL BX, $0x40 JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm12B - MOVB $0xff, (SI) - MOVD BP, 1(SI) + MOVB $0xff, (DI) + MOVD SI, 1(DI) LEAQ -64(BX), BX - ADDQ $0x05, SI + ADDQ $0x05, DI CMPL BX, $0x04 JL four_bytes_remain_repeat_as_copy_encodeBlockAsm12B emit_repeat_again_repeat_as_copy_encodeBlockAsm12B_emit_copy: - MOVQ BX, DI + MOVQ BX, R8 LEAQ -4(BX), BX - CMPL DI, $0x08 + CMPL R8, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy: @@ -3311,52 +3311,52 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy: CMPL BX, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsm12B_emit_copy LEAQ -16842747(BX), BX - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm12B_emit_copy repeat_five_repeat_as_copy_encodeBlockAsm12B_emit_copy: LEAQ -65536(BX), BX - MOVQ BX, BP - MOVW $0x001d, (SI) - MOVW BX, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + MOVQ BX, SI + MOVW $0x001d, (DI) + MOVW BX, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP repeat_end_emit_encodeBlockAsm12B repeat_four_repeat_as_copy_encodeBlockAsm12B_emit_copy: LEAQ -256(BX), BX - MOVW $0x0019, (SI) - MOVW BX, 2(SI) - ADDQ $0x04, SI + MOVW $0x0019, (DI) + MOVW BX, 2(DI) + ADDQ $0x04, DI JMP repeat_end_emit_encodeBlockAsm12B repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy: LEAQ -4(BX), BX - MOVW $0x0015, (SI) - MOVB BL, 2(SI) - ADDQ $0x03, SI + MOVW $0x0015, (DI) + MOVB BL, 2(DI) + ADDQ $0x03, DI JMP repeat_end_emit_encodeBlockAsm12B repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy: SHLL $0x02, BX ORL $0x01, BX - MOVW BX, (SI) - ADDQ $0x02, SI + MOVW BX, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm12B repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy: - XORQ DI, DI - LEAQ 1(DI)(BX*4), BX - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (SI) - ADDQ $0x02, SI + XORQ R8, R8 + LEAQ 1(R8)(BX*4), BX + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm12B four_bytes_remain_repeat_as_copy_encodeBlockAsm12B: @@ -3364,27 +3364,27 @@ four_bytes_remain_repeat_as_copy_encodeBlockAsm12B: JZ repeat_end_emit_encodeBlockAsm12B MOVB $0x03, DL LEAQ -4(DX)(BX*4), BX - MOVB BL, (SI) - MOVD BP, 1(SI) - ADDQ $0x05, SI + MOVB BL, (DI) + MOVD SI, 1(DI) + ADDQ $0x05, DI JMP repeat_end_emit_encodeBlockAsm12B two_byte_offset_repeat_as_copy_encodeBlockAsm12B: CMPL BX, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B - MOVB $0xee, (SI) - MOVW BP, 1(SI) + MOVB $0xee, (DI) + MOVW SI, 1(DI) LEAQ -60(BX), BX - ADDQ $0x03, SI + ADDQ $0x03, DI emit_repeat_again_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - MOVQ BX, DI + MOVQ BX, R8 LEAQ -4(BX), BX - CMPL DI, $0x08 + CMPL R8, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: @@ -3395,100 +3395,100 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: CMPL BX, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsm12B_emit_copy_short LEAQ -16842747(BX), BX - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm12B_emit_copy_short repeat_five_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: LEAQ -65536(BX), BX - MOVQ BX, BP - MOVW $0x001d, (SI) - MOVW BX, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + MOVQ BX, SI + MOVW $0x001d, (DI) + MOVW BX, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP repeat_end_emit_encodeBlockAsm12B repeat_four_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: LEAQ -256(BX), BX - MOVW $0x0019, (SI) - MOVW BX, 2(SI) - ADDQ $0x04, SI + MOVW $0x0019, (DI) + MOVW BX, 2(DI) + ADDQ $0x04, DI JMP repeat_end_emit_encodeBlockAsm12B repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: LEAQ -4(BX), BX - MOVW $0x0015, (SI) - MOVB BL, 2(SI) - ADDQ $0x03, SI + MOVW $0x0015, (DI) + MOVB BL, 2(DI) + ADDQ $0x03, DI JMP repeat_end_emit_encodeBlockAsm12B repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: SHLL $0x02, BX ORL $0x01, BX - MOVW BX, (SI) - ADDQ $0x02, SI + MOVW BX, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm12B repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - XORQ DI, DI - LEAQ 1(DI)(BX*4), BX - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (SI) - ADDQ $0x02, SI + XORQ R8, R8 + LEAQ 1(R8)(BX*4), BX + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm12B two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B: CMPL BX, $0x0c JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B MOVB $0x01, DL LEAQ -16(DX)(BX*4), BX - MOVB BP, 1(SI) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (SI) - ADDQ $0x02, SI + MOVB SI, 1(DI) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm12B emit_copy_three_repeat_as_copy_encodeBlockAsm12B: MOVB $0x02, DL LEAQ -4(DX)(BX*4), BX - MOVB BL, (SI) - MOVW BP, 1(SI) - ADDQ $0x03, SI + MOVB BL, (DI) + MOVW SI, 1(DI) + ADDQ $0x03, DI repeat_end_emit_encodeBlockAsm12B: - MOVQ SI, dst_base+0(FP) + MOVQ DI, dst_base+0(FP) MOVL 16(SP), BX CMPL AX, BX JGT emit_remainder_encodeBlockAsm12B JMP search_loop_encodeBlockAsm12B no_repeat_found_encodeBlockAsm12B: - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ BP, DI - SHRQ $0x10, DI - SHLQ $0x10, DI - IMULQ R8, DI - SHRQ $0x34, DI - CMPL (CX)(BX*1), BP - SHRQ $0x08, BP + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ SI, R8 + SHRQ $0x10, R8 + SHLQ $0x10, R8 + IMULQ R9, R8 + SHRQ $0x34, R8 + CMPL (CX)(BX*1), SI + SHRQ $0x08, SI JEQ candidate_match_encodeBlockAsm12B - MOVL 32(SP)(DI*1), BX - CMPL (CX)(SI*1), BP + MOVL 32(SP)(R8*1), BX + CMPL (CX)(DI*1), SI JEQ candidate2_match_encodeBlockAsm12B - LEAQ 2(AX), SI - MOVL SI, 32(SP)(DI*1) - SHRQ $0x08, BP - CMPL (CX)(BX*1), BP + LEAQ 2(AX), DI + MOVL DI, 32(SP)(R8*1) + SHRQ $0x08, SI + CMPL (CX)(BX*1), SI JEQ candidate3_match_encodeBlockAsm12B MOVL 28(SP), AX JMP search_loop_encodeBlockAsm12B @@ -3499,21 +3499,21 @@ candidate3_match_encodeBlockAsm12B: candidate2_match_encodeBlockAsm12B: LEAQ -2(AX), BX - MOVL BX, 32(SP)(DI*1) + MOVL BX, 32(SP)(R8*1) INCL AX - MOVL SI, BX + MOVL DI, BX candidate_match_encodeBlockAsm12B: - MOVL 20(SP), BP + MOVL 20(SP), SI TESTL BX, BX JZ match_extend_back_end_encodeBlockAsm12B match_extend_back_loop_encodeBlockAsm12B: - CMPL AX, BP + CMPL AX, SI JG match_extend_back_end_encodeBlockAsm12B MOVB -1(CX)(BX*1), DL - MOVB -1(CX)(AX*1), SI - CMPB DL, SI + MOVB -1(CX)(AX*1), DI + CMPB DL, DI JNE match_extend_back_end_encodeBlockAsm12B LEAL -1(AX), AX DECL BX @@ -3521,507 +3521,507 @@ match_extend_back_loop_encodeBlockAsm12B: JMP match_extend_back_loop_encodeBlockAsm12B match_extend_back_end_encodeBlockAsm12B: - MOVL AX, BP - SUBL 20(SP), BP - LEAQ dst_base+0(FP)(BP*1), BP - CMPQ BP, (SP) + MOVL AX, SI + SUBL 20(SP), SI + LEAQ dst_base+0(FP)(SI*1), SI + CMPQ SI, (SP) JL match_dst_size_check_encodeBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBlockAsm12B: - MOVL BX, BP - MOVL 20(SP), SI - CMPL SI, BP + MOVL BX, SI + MOVL 20(SP), DI + CMPL DI, SI JEQ emit_literal_skip_match_emit_encodeBlockAsm12B - MOVL BP, DI - MOVL BP, 20(SP) - LEAQ (CX)(SI*1), BP - SUBL SI, DI - MOVQ dst_base+0(FP), SI - MOVQ DI, R8 - SUBL $0x01, R8 + MOVL SI, R8 + MOVL SI, 20(SP) + LEAQ (CX)(DI*1), SI + SUBL DI, R8 + MOVQ dst_base+0(FP), DI + MOVQ R8, R9 + SUBL $0x01, R9 JC emit_literal_done_match_emit_encodeBlockAsm12B - CMPL R8, $0x3c + CMPL R9, $0x3c JLT one_byte_match_emit_encodeBlockAsm12B - CMPL R8, $0x00000100 + CMPL R9, $0x00000100 JLT two_bytes_match_emit_encodeBlockAsm12B - CMPL R8, $0x00010000 + CMPL R9, $0x00010000 JLT three_bytes_match_emit_encodeBlockAsm12B - CMPL R8, $0x01000000 + CMPL R9, $0x01000000 JLT four_bytes_match_emit_encodeBlockAsm12B - MOVB $0xfc, (SI) - MOVL R8, 1(SI) - ADDQ $0x05, SI + MOVB $0xfc, (DI) + MOVL R9, 1(DI) + ADDQ $0x05, DI JMP memmove_match_emit_encodeBlockAsm12B four_bytes_match_emit_encodeBlockAsm12B: - MOVQ R8, R9 - SHRL $0x10, R9 - MOVB $0xf8, (SI) - MOVW R8, 1(SI) - MOVB R9, 3(SI) - ADDQ $0x04, SI + MOVQ R9, R10 + SHRL $0x10, R10 + MOVB $0xf8, (DI) + MOVW R9, 1(DI) + MOVB R10, 3(DI) + ADDQ $0x04, DI JMP memmove_match_emit_encodeBlockAsm12B three_bytes_match_emit_encodeBlockAsm12B: - MOVB $0xf4, (SI) - MOVW R8, 1(SI) - ADDQ $0x03, SI + MOVB $0xf4, (DI) + MOVW R9, 1(DI) + ADDQ $0x03, DI JMP memmove_match_emit_encodeBlockAsm12B two_bytes_match_emit_encodeBlockAsm12B: - MOVB $0xf0, (SI) - MOVB R8, 1(SI) - ADDQ $0x02, SI + MOVB $0xf0, (DI) + MOVB R9, 1(DI) + ADDQ $0x02, DI JMP memmove_match_emit_encodeBlockAsm12B one_byte_match_emit_encodeBlockAsm12B: - SHLB $0x02, R8 - MOVB R8, (SI) - ADDQ $0x01, SI + SHLB $0x02, R9 + MOVB R9, (DI) + ADDQ $0x01, DI memmove_match_emit_encodeBlockAsm12B: - LEAQ (SI)(DI*1), R8 + LEAQ (DI)(R8*1), R9 NOP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_tail: - TESTQ DI, DI + TESTQ R8, R8 JEQ emit_literal_done_match_emit_encodeBlockAsm12B - CMPQ DI, $0x02 + CMPQ R8, $0x02 JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_1or2 - CMPQ DI, $0x04 + CMPQ R8, $0x04 JB emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_3 JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_4 - CMPQ DI, $0x08 + CMPQ R8, $0x08 JB emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_5through7 JE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8 - CMPQ DI, $0x10 + CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_9through16 - CMPQ DI, $0x20 + CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32 - CMPQ DI, $0x40 + CMPQ R8, $0x40 JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64 - CMPQ DI, $0x80 + CMPQ R8, $0x80 JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_65through128 - CMPQ DI, $0x00000100 + CMPQ R8, $0x00000100 JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_129through256 JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_256through2048 emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_1or2: - MOVB (BP), R8 - MOVB -1(BP)(DI*1), BP - MOVB R8, (SI) - MOVB BP, -1(SI)(DI*1) + MOVB (SI), R9 + MOVB -1(SI)(R8*1), SI + MOVB R9, (DI) + MOVB SI, -1(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_4: - MOVL (BP), R8 - MOVL R8, (SI) + MOVL (SI), R9 + MOVL R9, (DI) JMP emit_literal_done_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_3: - MOVW (BP), R8 - MOVB 2(BP), BP - MOVW R8, (SI) - MOVB BP, 2(SI) + MOVW (SI), R9 + MOVB 2(SI), SI + MOVW R9, (DI) + MOVB SI, 2(DI) JMP emit_literal_done_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_5through7: - MOVL (BP), R8 - MOVL -4(BP)(DI*1), BP - MOVL R8, (SI) - MOVL BP, -4(SI)(DI*1) + MOVL (SI), R9 + MOVL -4(SI)(R8*1), SI + MOVL R9, (DI) + MOVL SI, -4(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8: - MOVQ (BP), R8 - MOVQ R8, (SI) + MOVQ (SI), R9 + MOVQ R9, (DI) JMP emit_literal_done_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_9through16: - MOVQ (BP), R8 - MOVQ -8(BP)(DI*1), BP - MOVQ R8, (SI) - MOVQ BP, -8(SI)(DI*1) + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), SI + MOVQ R9, (DI) + MOVQ SI, -8(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32: - MOVOU (BP), X0 - MOVOU -16(BP)(DI*1), X1 - MOVOU X0, (SI) - MOVOU X1, -16(SI)(DI*1) + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 + MOVOU X0, (DI) + MOVOU X1, -16(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64: - MOVOU (BP), X0 - MOVOU 16(BP), X1 - MOVOU -32(BP)(DI*1), X2 - MOVOU -16(BP)(DI*1), X3 - MOVOU X0, (SI) - MOVOU X1, 16(SI) - MOVOU X2, -32(SI)(DI*1) - MOVOU X3, -16(SI)(DI*1) + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, -32(DI)(R8*1) + MOVOU X3, -16(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_65through128: - MOVOU (BP), X0 - MOVOU 16(BP), X1 - MOVOU 32(BP), X2 - MOVOU 48(BP), X3 - MOVOU -64(BP)(DI*1), X12 - MOVOU -48(BP)(DI*1), X13 - MOVOU -32(BP)(DI*1), X14 - MOVOU -16(BP)(DI*1), X15 - MOVOU X0, (SI) - MOVOU X1, 16(SI) - MOVOU X2, 32(SI) - MOVOU X3, 48(SI) - MOVOU X12, -64(SI)(DI*1) - MOVOU X13, -48(SI)(DI*1) - MOVOU X14, -32(SI)(DI*1) - MOVOU X15, -16(SI)(DI*1) + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) + MOVOU X3, 48(DI) + MOVOU X12, -64(DI)(R8*1) + MOVOU X13, -48(DI)(R8*1) + MOVOU X14, -32(DI)(R8*1) + MOVOU X15, -16(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_129through256: - MOVOU (BP), X0 - MOVOU 16(BP), X1 - MOVOU 32(BP), X2 - MOVOU 48(BP), X3 - MOVOU 64(BP), X4 - MOVOU 80(BP), X5 - MOVOU 96(BP), X6 - MOVOU 112(BP), X7 - MOVOU -128(BP)(DI*1), X8 - MOVOU -112(BP)(DI*1), X9 - MOVOU -96(BP)(DI*1), X10 - MOVOU -80(BP)(DI*1), X11 - MOVOU -64(BP)(DI*1), X12 - MOVOU -48(BP)(DI*1), X13 - MOVOU -32(BP)(DI*1), X14 - MOVOU -16(BP)(DI*1), X15 - MOVOU X0, (SI) - MOVOU X1, 16(SI) - MOVOU X2, 32(SI) - MOVOU X3, 48(SI) - MOVOU X4, 64(SI) - MOVOU X5, 80(SI) - MOVOU X6, 96(SI) - MOVOU X7, 112(SI) - MOVOU X8, -128(SI)(DI*1) - MOVOU X9, -112(SI)(DI*1) - MOVOU X10, -96(SI)(DI*1) - MOVOU X11, -80(SI)(DI*1) - MOVOU X12, -64(SI)(DI*1) - MOVOU X13, -48(SI)(DI*1) - MOVOU X14, -32(SI)(DI*1) - MOVOU X15, -16(SI)(DI*1) + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU -128(SI)(R8*1), X8 + MOVOU -112(SI)(R8*1), X9 + MOVOU -96(SI)(R8*1), X10 + MOVOU -80(SI)(R8*1), X11 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) + MOVOU X3, 48(DI) + MOVOU X4, 64(DI) + MOVOU X5, 80(DI) + MOVOU X6, 96(DI) + MOVOU X7, 112(DI) + MOVOU X8, -128(DI)(R8*1) + MOVOU X9, -112(DI)(R8*1) + MOVOU X10, -96(DI)(R8*1) + MOVOU X11, -80(DI)(R8*1) + MOVOU X12, -64(DI)(R8*1) + MOVOU X13, -48(DI)(R8*1) + MOVOU X14, -32(DI)(R8*1) + MOVOU X15, -16(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_256through2048: - LEAQ -256(DI), DI - MOVOU (BP), X0 - MOVOU 16(BP), X1 - MOVOU 32(BP), X2 - MOVOU 48(BP), X3 - MOVOU 64(BP), X4 - MOVOU 80(BP), X5 - MOVOU 96(BP), X6 - MOVOU 112(BP), X7 - MOVOU 128(BP), X8 - MOVOU 144(BP), X9 - MOVOU 160(BP), X10 - MOVOU 176(BP), X11 - MOVOU 192(BP), X12 - MOVOU 208(BP), X13 - MOVOU 224(BP), X14 - MOVOU 240(BP), X15 - MOVOU X0, (SI) - MOVOU X1, 16(SI) - MOVOU X2, 32(SI) - MOVOU X3, 48(SI) - MOVOU X4, 64(SI) - MOVOU X5, 80(SI) - MOVOU X6, 96(SI) - MOVOU X7, 112(SI) - MOVOU X8, 128(SI) - MOVOU X9, 144(SI) - MOVOU X10, 160(SI) - MOVOU X11, 176(SI) - MOVOU X12, 192(SI) - MOVOU X13, 208(SI) - MOVOU X14, 224(SI) - MOVOU X15, 240(SI) - CMPQ DI, $0x00000100 - LEAQ 256(BP), BP + LEAQ -256(R8), R8 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU 128(SI), X8 + MOVOU 144(SI), X9 + MOVOU 160(SI), X10 + MOVOU 176(SI), X11 + MOVOU 192(SI), X12 + MOVOU 208(SI), X13 + MOVOU 224(SI), X14 + MOVOU 240(SI), X15 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) + MOVOU X3, 48(DI) + MOVOU X4, 64(DI) + MOVOU X5, 80(DI) + MOVOU X6, 96(DI) + MOVOU X7, 112(DI) + MOVOU X8, 128(DI) + MOVOU X9, 144(DI) + MOVOU X10, 160(DI) + MOVOU X11, 176(DI) + MOVOU X12, 192(DI) + MOVOU X13, 208(DI) + MOVOU X14, 224(DI) + MOVOU X15, 240(DI) + CMPQ R8, $0x00000100 LEAQ 256(SI), SI + LEAQ 256(DI), DI JGE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_256through2048 JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_tail - MOVQ R8, SI + MOVQ R9, DI emit_literal_done_match_emit_encodeBlockAsm12B: - MOVQ SI, dst_base+0(FP) + MOVQ DI, dst_base+0(FP) emit_literal_skip_match_emit_encodeBlockAsm12B: NOP match_nolit_loop_encodeBlockAsm12B: - MOVL AX, BP - MOVL AX, BP - SUBL BX, BP - MOVL BP, 24(SP) + MOVL AX, SI + MOVL AX, SI + SUBL BX, SI + MOVL SI, 24(SP) ADDL $0x04, AX ADDL $0x04, BX - MOVL 16(SP), BP - SUBL AX, BP - XORQ DI, DI - CMPQ BP, $0x08 + MOVL 16(SP), SI + SUBL AX, SI + XORQ R8, R8 + CMPQ SI, $0x08 JL matchlen_single_match_nolit_encodeBlockAsm12B matchlen_loopback_match_nolit_encodeBlockAsm12B: - MOVQ (CX)(DI*1), SI - XORQ (CX)(DI*1), SI - TESTQ SI, SI + MOVQ (CX)(R8*1), DI + XORQ (CX)(R8*1), DI + TESTQ DI, DI JZ matchlen_loop_match_nolit_encodeBlockAsm12B - BSFQ SI, SI - SARQ $0x03, SI - LEAQ (DI)(SI*1), DI + BSFQ DI, DI + SARQ $0x03, DI + LEAQ (R8)(DI*1), R8 JMP match_nolit_end_encodeBlockAsm12B matchlen_loop_match_nolit_encodeBlockAsm12B: - LEAQ -8(BP), BP - LEAQ 8(DI), DI - CMPQ BP, $0x08 + LEAQ -8(SI), SI + LEAQ 8(R8), R8 + CMPQ SI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm12B matchlen_single_match_nolit_encodeBlockAsm12B: - TESTQ BP, BP + TESTQ SI, SI JZ match_nolit_end_encodeBlockAsm12B matchlen_single_loopback_match_nolit_encodeBlockAsm12B: - MOVB (CX)(DI*1), SI - CMPB (CX)(DI*1), SI + MOVB (CX)(R8*1), DI + CMPB (CX)(R8*1), DI JNE match_nolit_end_encodeBlockAsm12B - LEAQ 1(DI), DI - DECQ BP + LEAQ 1(R8), R8 + DECQ SI JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm12B match_nolit_end_encodeBlockAsm12B: - MOVL 24(SP), BP - ADDQ $0x04, DI - MOVQ dst_base+0(FP), SI - ADDL DI, AX - CMPL BP, $0x00010000 + MOVL 24(SP), SI + ADDQ $0x04, R8 + MOVQ dst_base+0(FP), DI + ADDL R8, AX + CMPL SI, $0x00010000 JL two_byte_offset_match_nolit_encodeBlockAsm12B - CMPL DI, $0x40 + CMPL R8, $0x40 JLE four_bytes_remain_match_nolit_encodeBlockAsm12B - MOVB $0xff, (SI) - MOVD BP, 1(SI) - LEAQ -64(DI), DI - ADDQ $0x05, SI - CMPL DI, $0x04 + MOVB $0xff, (DI) + MOVD SI, 1(DI) + LEAQ -64(R8), R8 + ADDQ $0x05, DI + CMPL R8, $0x04 JL four_bytes_remain_match_nolit_encodeBlockAsm12B emit_repeat_again_match_nolit_encodeBlockAsm12B_emit_copy: - MOVQ DI, R8 - LEAQ -4(DI), DI - CMPL R8, $0x08 + MOVQ R8, R9 + LEAQ -4(R8), R8 + CMPL R9, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy - CMPL R8, $0x0c + CMPL R9, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy: - CMPL DI, $0x00000104 + CMPL R8, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy - CMPL DI, $0x00010100 + CMPL R8, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm12B_emit_copy - CMPL DI, $0x0100ffff + CMPL R8, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsm12B_emit_copy - LEAQ -16842747(DI), DI - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + LEAQ -16842747(R8), R8 + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_match_nolit_encodeBlockAsm12B_emit_copy repeat_five_match_nolit_encodeBlockAsm12B_emit_copy: - LEAQ -65536(DI), DI - MOVQ DI, BP - MOVW $0x001d, (SI) - MOVW DI, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + LEAQ -65536(R8), R8 + MOVQ R8, SI + MOVW $0x001d, (DI) + MOVW R8, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_four_match_nolit_encodeBlockAsm12B_emit_copy: - LEAQ -256(DI), DI - MOVW $0x0019, (SI) - MOVW DI, 2(SI) - ADDQ $0x04, SI + LEAQ -256(R8), R8 + MOVW $0x0019, (DI) + MOVW R8, 2(DI) + ADDQ $0x04, DI JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_three_match_nolit_encodeBlockAsm12B_emit_copy: - LEAQ -4(DI), DI - MOVW $0x0015, (SI) - MOVB DI, 2(SI) - ADDQ $0x03, SI + LEAQ -4(R8), R8 + MOVW $0x0015, (DI) + MOVB R8, 2(DI) + ADDQ $0x03, DI JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_two_match_nolit_encodeBlockAsm12B_emit_copy: - SHLL $0x02, DI - ORL $0x01, DI - MOVW DI, (SI) - ADDQ $0x02, SI + SHLL $0x02, R8 + ORL $0x01, R8 + MOVW R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy: - XORQ R8, R8 - LEAQ 1(R8)(DI*4), DI - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, DI - MOVB DI, (SI) - ADDQ $0x02, SI + XORQ R9, R9 + LEAQ 1(R9)(R8*4), R8 + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R8 + MOVB R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsm12B four_bytes_remain_match_nolit_encodeBlockAsm12B: - TESTL DI, DI + TESTL R8, R8 JZ match_nolit_emitcopy_end_encodeBlockAsm12B MOVB $0x03, DL - LEAQ -4(DX)(DI*4), DI - MOVB DI, (SI) - MOVD BP, 1(SI) - ADDQ $0x05, SI + LEAQ -4(DX)(R8*4), R8 + MOVB R8, (DI) + MOVD SI, 1(DI) + ADDQ $0x05, DI JMP match_nolit_emitcopy_end_encodeBlockAsm12B two_byte_offset_match_nolit_encodeBlockAsm12B: - CMPL DI, $0x40 + CMPL R8, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm12B - MOVB $0xee, (SI) - MOVW BP, 1(SI) - LEAQ -60(DI), DI - ADDQ $0x03, SI + MOVB $0xee, (DI) + MOVW SI, 1(DI) + LEAQ -60(R8), R8 + ADDQ $0x03, DI emit_repeat_again_match_nolit_encodeBlockAsm12B_emit_copy_short: - MOVQ DI, R8 - LEAQ -4(DI), DI - CMPL R8, $0x08 + MOVQ R8, R9 + LEAQ -4(R8), R8 + CMPL R9, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short - CMPL R8, $0x0c + CMPL R9, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: - CMPL DI, $0x00000104 + CMPL R8, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short - CMPL DI, $0x00010100 + CMPL R8, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm12B_emit_copy_short - CMPL DI, $0x0100ffff + CMPL R8, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsm12B_emit_copy_short - LEAQ -16842747(DI), DI - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + LEAQ -16842747(R8), R8 + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_match_nolit_encodeBlockAsm12B_emit_copy_short repeat_five_match_nolit_encodeBlockAsm12B_emit_copy_short: - LEAQ -65536(DI), DI - MOVQ DI, BP - MOVW $0x001d, (SI) - MOVW DI, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + LEAQ -65536(R8), R8 + MOVQ R8, SI + MOVW $0x001d, (DI) + MOVW R8, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_four_match_nolit_encodeBlockAsm12B_emit_copy_short: - LEAQ -256(DI), DI - MOVW $0x0019, (SI) - MOVW DI, 2(SI) - ADDQ $0x04, SI + LEAQ -256(R8), R8 + MOVW $0x0019, (DI) + MOVW R8, 2(DI) + ADDQ $0x04, DI JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short: - LEAQ -4(DI), DI - MOVW $0x0015, (SI) - MOVB DI, 2(SI) - ADDQ $0x03, SI + LEAQ -4(R8), R8 + MOVW $0x0015, (DI) + MOVB R8, 2(DI) + ADDQ $0x03, DI JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short: - SHLL $0x02, DI - ORL $0x01, DI - MOVW DI, (SI) - ADDQ $0x02, SI + SHLL $0x02, R8 + ORL $0x01, R8 + MOVW R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: - XORQ R8, R8 - LEAQ 1(R8)(DI*4), DI - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, DI - MOVB DI, (SI) - ADDQ $0x02, SI + XORQ R9, R9 + LEAQ 1(R9)(R8*4), R8 + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R8 + MOVB R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsm12B two_byte_offset_short_match_nolit_encodeBlockAsm12B: - CMPL DI, $0x0c + CMPL R8, $0x0c JGE emit_copy_three_match_nolit_encodeBlockAsm12B - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_match_nolit_encodeBlockAsm12B MOVB $0x01, DL - LEAQ -16(DX)(DI*4), DI - MOVB BP, 1(SI) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, DI - MOVB DI, (SI) - ADDQ $0x02, SI + LEAQ -16(DX)(R8*4), R8 + MOVB SI, 1(DI) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R8 + MOVB R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsm12B emit_copy_three_match_nolit_encodeBlockAsm12B: MOVB $0x02, DL - LEAQ -4(DX)(DI*4), DI - MOVB DI, (SI) - MOVW BP, 1(SI) - ADDQ $0x03, SI + LEAQ -4(DX)(R8*4), R8 + MOVB R8, (DI) + MOVW SI, 1(DI) + ADDQ $0x03, DI match_nolit_emitcopy_end_encodeBlockAsm12B: - MOVQ SI, dst_base+0(FP) + MOVQ DI, dst_base+0(FP) MOVL AX, 20(SP) CMPL AX, 16(SP) JGE emit_remainder_encodeBlockAsm12B - CMPQ SI, (SP) + CMPQ DI, (SP) JL match_nolit_dst_ok_encodeBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeBlockAsm12B: - MOVQ -2(CX)(AX*1), BP - MOVQ $0x0000cf1bbcdcbf9b, SI - MOVQ BP, DI - SHRQ $0x10, BP - MOVQ BP, R8 - SHLQ $0x10, DI - IMULQ SI, DI - SHRQ $0x34, DI + MOVQ -2(CX)(AX*1), SI + MOVQ $0x0000cf1bbcdcbf9b, DI + MOVQ SI, R8 + SHRQ $0x10, SI + MOVQ SI, R9 SHLQ $0x10, R8 - IMULQ SI, R8 + IMULQ DI, R8 SHRQ $0x34, R8 - MOVL 32(SP)(DI*1), SI - MOVL 32(SP)(R8*1), SI - LEAQ -2(AX), SI - MOVL SI, 32(SP)(DI*1) - MOVL AX, 32(SP)(R8*1) - CMPL (CX)(R8*1), BP + SHLQ $0x10, R9 + IMULQ DI, R9 + SHRQ $0x34, R9 + MOVL 32(SP)(R8*1), DI + MOVL 32(SP)(R9*1), DI + LEAQ -2(AX), DI + MOVL DI, 32(SP)(R8*1) + MOVL AX, 32(SP)(R9*1) + CMPL (CX)(R9*1), SI JEQ match_nolit_loop_encodeBlockAsm12B INCL AX JMP search_loop_encodeBlockAsm12B @@ -4063,11 +4063,11 @@ emit_remainder_ok_encodeBlockAsm12B: JMP memmove_emit_remainder_encodeBlockAsm12B four_bytes_emit_remainder_encodeBlockAsm12B: - MOVQ DX, BP - SHRL $0x10, BP + MOVQ DX, SI + SHRL $0x10, SI MOVB $0xf8, (CX) MOVW DX, 1(CX) - MOVB BP, 3(CX) + MOVB SI, 3(CX) ADDQ $0x04, CX JMP memmove_emit_remainder_encodeBlockAsm12B @@ -4310,224 +4310,224 @@ zero_loop_encodeBlockAsmAvx: MOVQ src_base+24(FP), CX search_loop_encodeBlockAsmAvx: - MOVQ (CX)(AX*1), BP + MOVQ (CX)(AX*1), SI MOVL AX, BX SUBL 20(SP), BX SHRL $0x06, BX LEAQ 4(AX)(BX*1), BX - MOVL 16(SP), SI - CMPL BX, SI + MOVL 16(SP), DI + CMPL BX, DI JGT emit_remainder_encodeBlockAsmAvx MOVL BX, 28(SP) MOVQ $0x0000cf1bbcdcbf9b, BX - MOVQ BP, DI - MOVQ BP, R8 - SHRQ $0x08, R8 - SHLQ $0x10, DI - IMULQ BX, DI - SHRQ $0x30, DI + MOVQ SI, R8 + MOVQ SI, R9 + SHRQ $0x08, R9 SHLQ $0x10, R8 IMULQ BX, R8 SHRQ $0x30, R8 - MOVL 32(SP)(DI*1), BX - MOVL 32(SP)(R8*1), SI - MOVL AX, 32(SP)(DI*1) - LEAL 1(AX), DI - MOVL DI, 32(SP)(R8*1) - MOVL AX, DI - SUBL 24(SP), DI - MOVL 1(CX)(DI*1), R9 - MOVQ BP, R8 - SHLQ $0x08, R8 - CMPL R8, R9 + SHLQ $0x10, R9 + IMULQ BX, R9 + SHRQ $0x30, R9 + MOVL 32(SP)(R8*1), BX + MOVL 32(SP)(R9*1), DI + MOVL AX, 32(SP)(R8*1) + LEAL 1(AX), R8 + MOVL R8, 32(SP)(R9*1) + MOVL AX, R8 + SUBL 24(SP), R8 + MOVL 1(CX)(R8*1), R10 + MOVQ SI, R9 + SHLQ $0x08, R9 + CMPL R9, R10 JNE no_repeat_found_encodeBlockAsmAvx - LEAQ 1(AX), BP + LEAQ 1(AX), SI MOVL 20(SP), BX - TESTL DI, DI + TESTL R8, R8 JZ repeat_extend_back_end_encodeBlockAsmAvx repeat_extend_back_loop_encodeBlockAsmAvx: - CMPL BP, BX + CMPL SI, BX JG repeat_extend_back_end_encodeBlockAsmAvx - MOVB -1(CX)(DI*1), DL - MOVB -1(CX)(BP*1), SI - CMPB DL, SI + MOVB -1(CX)(R8*1), DL + MOVB -1(CX)(SI*1), DI + CMPB DL, DI JNE repeat_extend_back_end_encodeBlockAsmAvx - LEAQ -1(BP), BP - DECL DI + LEAQ -1(SI), SI + DECL R8 JZ repeat_extend_back_end_encodeBlockAsmAvx JMP repeat_extend_back_loop_encodeBlockAsmAvx repeat_extend_back_end_encodeBlockAsmAvx: MOVL 20(SP), BX - CMPL BX, BP + CMPL BX, SI JEQ emit_literal_skip_repeat_emit_encodeBlockAsmAvx - MOVL BP, SI - MOVL BP, 20(SP) - LEAQ (CX)(BX*1), DI - SUBL BX, SI + MOVL SI, DI + MOVL SI, 20(SP) + LEAQ (CX)(BX*1), R8 + SUBL BX, DI MOVQ dst_base+0(FP), BX - MOVQ SI, R8 - SUBL $0x01, R8 + MOVQ DI, R9 + SUBL $0x01, R9 JC emit_literal_done_repeat_emit_encodeBlockAsmAvx - CMPL R8, $0x3c + CMPL R9, $0x3c JLT one_byte_repeat_emit_encodeBlockAsmAvx - CMPL R8, $0x00000100 + CMPL R9, $0x00000100 JLT two_bytes_repeat_emit_encodeBlockAsmAvx - CMPL R8, $0x00010000 + CMPL R9, $0x00010000 JLT three_bytes_repeat_emit_encodeBlockAsmAvx - CMPL R8, $0x01000000 + CMPL R9, $0x01000000 JLT four_bytes_repeat_emit_encodeBlockAsmAvx MOVB $0xfc, (BX) - MOVL R8, 1(BX) + MOVL R9, 1(BX) ADDQ $0x05, BX JMP memmove_repeat_emit_encodeBlockAsmAvx four_bytes_repeat_emit_encodeBlockAsmAvx: - MOVQ R8, R9 - SHRL $0x10, R9 + MOVQ R9, R10 + SHRL $0x10, R10 MOVB $0xf8, (BX) - MOVW R8, 1(BX) - MOVB R9, 3(BX) + MOVW R9, 1(BX) + MOVB R10, 3(BX) ADDQ $0x04, BX JMP memmove_repeat_emit_encodeBlockAsmAvx three_bytes_repeat_emit_encodeBlockAsmAvx: MOVB $0xf4, (BX) - MOVW R8, 1(BX) + MOVW R9, 1(BX) ADDQ $0x03, BX JMP memmove_repeat_emit_encodeBlockAsmAvx two_bytes_repeat_emit_encodeBlockAsmAvx: MOVB $0xf0, (BX) - MOVB R8, 1(BX) + MOVB R9, 1(BX) ADDQ $0x02, BX JMP memmove_repeat_emit_encodeBlockAsmAvx one_byte_repeat_emit_encodeBlockAsmAvx: - SHLB $0x02, R8 - MOVB R8, (BX) + SHLB $0x02, R9 + MOVB R9, (BX) ADDQ $0x01, BX memmove_repeat_emit_encodeBlockAsmAvx: - LEAQ (BX)(SI*1), R8 + LEAQ (BX)(DI*1), R9 NOP emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_tail: - TESTQ SI, SI + TESTQ DI, DI JEQ emit_literal_done_repeat_emit_encodeBlockAsmAvx - CMPQ SI, $0x02 + CMPQ DI, $0x02 JBE emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_1or2 - CMPQ SI, $0x04 + CMPQ DI, $0x04 JB emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_3 JBE emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_4 - CMPQ SI, $0x08 + CMPQ DI, $0x08 JB emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_5through7 JE emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_8 - CMPQ SI, $0x10 + CMPQ DI, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_9through16 - CMPQ SI, $0x20 + CMPQ DI, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_17through32 - CMPQ SI, $0x40 + CMPQ DI, $0x40 JBE emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_33through64 - CMPQ SI, $0x80 + CMPQ DI, $0x80 JBE emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_65through128 - CMPQ SI, $0x00000100 + CMPQ DI, $0x00000100 JBE emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_129through256 JMP emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_avxUnaligned emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_1or2: - MOVB (DI), R8 - MOVB -1(DI)(SI*1), R9 - MOVB R8, (BX) - MOVB R9, -1(BX)(SI*1) + MOVB (R8), R9 + MOVB -1(R8)(DI*1), R10 + MOVB R9, (BX) + MOVB R10, -1(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsmAvx emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_4: - MOVL (DI), R8 - MOVL R8, (BX) + MOVL (R8), R9 + MOVL R9, (BX) JMP emit_literal_done_repeat_emit_encodeBlockAsmAvx emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_3: - MOVW (DI), R8 - MOVB 2(DI), R9 - MOVW R8, (BX) - MOVB R9, 2(BX) + MOVW (R8), R9 + MOVB 2(R8), R10 + MOVW R9, (BX) + MOVB R10, 2(BX) JMP emit_literal_done_repeat_emit_encodeBlockAsmAvx emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_5through7: - MOVL (DI), R8 - MOVL -4(DI)(SI*1), R9 - MOVL R8, (BX) - MOVL R9, -4(BX)(SI*1) + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R10 + MOVL R9, (BX) + MOVL R10, -4(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsmAvx emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_8: - MOVQ (DI), R8 - MOVQ R8, (BX) + MOVQ (R8), R9 + MOVQ R9, (BX) JMP emit_literal_done_repeat_emit_encodeBlockAsmAvx emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_9through16: - MOVQ (DI), R8 - MOVQ -8(DI)(SI*1), R9 - MOVQ R8, (BX) - MOVQ R9, -8(BX)(SI*1) + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R10 + MOVQ R9, (BX) + MOVQ R10, -8(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsmAvx emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(SI*1), X1 + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 MOVOU X0, (BX) - MOVOU X1, -16(BX)(SI*1) + MOVOU X1, -16(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsmAvx emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(SI*1), X2 - MOVOU -16(DI)(SI*1), X3 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 MOVOU X0, (BX) MOVOU X1, 16(BX) - MOVOU X2, -32(BX)(SI*1) - MOVOU X3, -16(BX)(SI*1) + MOVOU X2, -32(BX)(DI*1) + MOVOU X3, -16(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsmAvx emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_65through128: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU 32(DI), X2 - MOVOU 48(DI), X3 - MOVOU -64(DI)(SI*1), X12 - MOVOU -48(DI)(SI*1), X13 - MOVOU -32(DI)(SI*1), X14 - MOVOU -16(DI)(SI*1), X15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (BX) MOVOU X1, 16(BX) MOVOU X2, 32(BX) MOVOU X3, 48(BX) - MOVOU X12, -64(BX)(SI*1) - MOVOU X13, -48(BX)(SI*1) - MOVOU X14, -32(BX)(SI*1) - MOVOU X15, -16(BX)(SI*1) + MOVOU X12, -64(BX)(DI*1) + MOVOU X13, -48(BX)(DI*1) + MOVOU X14, -32(BX)(DI*1) + MOVOU X15, -16(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsmAvx emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_129through256: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU 32(DI), X2 - MOVOU 48(DI), X3 - MOVOU 64(DI), X4 - MOVOU 80(DI), X5 - MOVOU 96(DI), X6 - MOVOU 112(DI), X7 - MOVOU -128(DI)(SI*1), X8 - MOVOU -112(DI)(SI*1), X9 - MOVOU -96(DI)(SI*1), X10 - MOVOU -80(DI)(SI*1), X11 - MOVOU -64(DI)(SI*1), X12 - MOVOU -48(DI)(SI*1), X13 - MOVOU -32(DI)(SI*1), X14 - MOVOU -16(DI)(SI*1), X15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU -128(R8)(DI*1), X8 + MOVOU -112(R8)(DI*1), X9 + MOVOU -96(R8)(DI*1), X10 + MOVOU -80(R8)(DI*1), X11 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (BX) MOVOU X1, 16(BX) MOVOU X2, 32(BX) @@ -4536,34 +4536,34 @@ emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_129through256: MOVOU X5, 80(BX) MOVOU X6, 96(BX) MOVOU X7, 112(BX) - MOVOU X8, -128(BX)(SI*1) - MOVOU X9, -112(BX)(SI*1) - MOVOU X10, -96(BX)(SI*1) - MOVOU X11, -80(BX)(SI*1) - MOVOU X12, -64(BX)(SI*1) - MOVOU X13, -48(BX)(SI*1) - MOVOU X14, -32(BX)(SI*1) - MOVOU X15, -16(BX)(SI*1) + MOVOU X8, -128(BX)(DI*1) + MOVOU X9, -112(BX)(DI*1) + MOVOU X10, -96(BX)(DI*1) + MOVOU X11, -80(BX)(DI*1) + MOVOU X12, -64(BX)(DI*1) + MOVOU X13, -48(BX)(DI*1) + MOVOU X14, -32(BX)(DI*1) + MOVOU X15, -16(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsmAvx emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_256through2048: - LEAQ -256(SI), SI - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU 32(DI), X2 - MOVOU 48(DI), X3 - MOVOU 64(DI), X4 - MOVOU 80(DI), X5 - MOVOU 96(DI), X6 - MOVOU 112(DI), X7 - MOVOU 128(DI), X8 - MOVOU 144(DI), X9 - MOVOU 160(DI), X10 - MOVOU 176(DI), X11 - MOVOU 192(DI), X12 - MOVOU 208(DI), X13 - MOVOU 224(DI), X14 - MOVOU 240(DI), X15 + LEAQ -256(DI), DI + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU 128(R8), X8 + MOVOU 144(R8), X9 + MOVOU 160(R8), X10 + MOVOU 176(R8), X11 + MOVOU 192(R8), X12 + MOVOU 208(R8), X13 + MOVOU 224(R8), X14 + MOVOU 240(R8), X15 MOVOU X0, (BX) MOVOU X1, 16(BX) MOVOU X2, 32(BX) @@ -4580,60 +4580,60 @@ emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_256through2048: MOVOU X13, 208(BX) MOVOU X14, 224(BX) MOVOU X15, 240(BX) - CMPQ SI, $0x00000100 - LEAQ 256(DI), DI + CMPQ DI, $0x00000100 + LEAQ 256(R8), R8 LEAQ 256(BX), BX JGE emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_256through2048 JMP emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_tail emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_avxUnaligned: - LEAQ (DI)(SI*1), R9 - MOVQ BX, R11 - MOVOU -128(R9), X5 - MOVOU -112(R9), X6 - MOVQ $0x00000080, R8 + LEAQ (R8)(DI*1), R10 + MOVQ BX, R12 + MOVOU -128(R10), X5 + MOVOU -112(R10), X6 + MOVQ $0x00000080, R9 ANDQ $0xffffffe0, BX ADDQ $0x20, BX - MOVOU -96(R9), X7 - MOVOU -80(R9), X8 - MOVQ BX, R10 - SUBQ R11, R10 - MOVOU -64(R9), X9 - MOVOU -48(R9), X10 - SUBQ R10, SI - MOVOU -32(R9), X11 - MOVOU -16(R9), X12 - VMOVDQU (DI), Y4 - ADDQ R10, DI - SUBQ R8, SI + MOVOU -96(R10), X7 + MOVOU -80(R10), X8 + MOVQ BX, R11 + SUBQ R12, R11 + MOVOU -64(R10), X9 + MOVOU -48(R10), X10 + SUBQ R11, DI + MOVOU -32(R10), X11 + MOVOU -16(R10), X12 + VMOVDQU (R8), Y4 + ADDQ R11, R8 + SUBQ R9, DI emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_gobble_128_loop: - VMOVDQU (DI), Y0 - VMOVDQU 32(DI), Y1 - VMOVDQU 64(DI), Y2 - VMOVDQU 96(DI), Y3 - ADDQ R8, DI + VMOVDQU (R8), Y0 + VMOVDQU 32(R8), Y1 + VMOVDQU 64(R8), Y2 + VMOVDQU 96(R8), Y3 + ADDQ R9, R8 VMOVDQA Y0, (BX) VMOVDQA Y1, 32(BX) VMOVDQA Y2, 64(BX) VMOVDQA Y3, 96(BX) - ADDQ R8, BX - SUBQ R8, SI + ADDQ R9, BX + SUBQ R9, DI JA emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_gobble_128_loop - ADDQ R8, SI - ADDQ BX, SI - VMOVDQU Y4, (R11) + ADDQ R9, DI + ADDQ BX, DI + VMOVDQU Y4, (R12) VZEROUPPER - MOVOU X5, -128(SI) - MOVOU X6, -112(SI) - MOVOU X7, -96(SI) - MOVOU X8, -80(SI) - MOVOU X9, -64(SI) - MOVOU X10, -48(SI) - MOVOU X11, -32(SI) - MOVOU X12, -16(SI) + MOVOU X5, -128(DI) + MOVOU X6, -112(DI) + MOVOU X7, -96(DI) + MOVOU X8, -80(DI) + MOVOU X9, -64(DI) + MOVOU X10, -48(DI) + MOVOU X11, -32(DI) + MOVOU X12, -16(DI) JMP emit_literal_done_repeat_emit_encodeBlockAsmAvx - MOVQ R8, BX + MOVQ R9, BX emit_literal_done_repeat_emit_encodeBlockAsmAvx: MOVQ BX, dst_base+0(FP) @@ -4644,23 +4644,23 @@ emit_literal_skip_repeat_emit_encodeBlockAsmAvx: SUBL 24(SP), BX MOVL 16(SP), BX SUBL AX, BX - XORQ DI, DI + XORQ R8, R8 CMPQ BX, $0x08 JL matchlen_single_repeat_extend matchlen_loopback_repeat_extend: - MOVQ (CX)(DI*1), SI - XORQ (CX)(DI*1), SI - TESTQ SI, SI + MOVQ (CX)(R8*1), DI + XORQ (CX)(R8*1), DI + TESTQ DI, DI JZ matchlen_loop_repeat_extend - BSFQ SI, SI - SARQ $0x03, SI - LEAQ (DI)(SI*1), DI + BSFQ DI, DI + SARQ $0x03, DI + LEAQ (R8)(DI*1), R8 JMP repeat_extend_forward_end_encodeBlockAsmAvx matchlen_loop_repeat_extend: LEAQ -8(BX), BX - LEAQ 8(DI), DI + LEAQ 8(R8), R8 CMPQ BX, $0x08 JGE matchlen_loopback_repeat_extend @@ -4669,31 +4669,31 @@ matchlen_single_repeat_extend: JZ repeat_extend_forward_end_encodeBlockAsmAvx matchlen_single_loopback_repeat_extend: - MOVB (CX)(DI*1), SI - CMPB (CX)(DI*1), SI + MOVB (CX)(R8*1), DI + CMPB (CX)(R8*1), DI JNE repeat_extend_forward_end_encodeBlockAsmAvx - LEAQ 1(DI), DI + LEAQ 1(R8), R8 DECQ BX JNZ matchlen_single_loopback_repeat_extend repeat_extend_forward_end_encodeBlockAsmAvx: - ADDL DI, AX + ADDL R8, AX MOVL AX, BX - SUBL BP, BX - MOVL 24(SP), BP - MOVQ dst_base+0(FP), SI - MOVL 20(SP), DI - TESTL DI, DI + SUBL SI, BX + MOVL 24(SP), SI + MOVQ dst_base+0(FP), DI + MOVL 20(SP), R8 + TESTL R8, R8 JZ repeat_as_copy_encodeBlockAsmAvx emit_repeat_again_match_repeat_: - MOVQ BX, DI + MOVQ BX, R8 LEAQ -4(BX), BX - CMPL DI, $0x08 + CMPL R8, $0x08 JLE repeat_two_match_repeat_ - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_match_repeat_ - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_repeat_ cant_repeat_two_offset_match_repeat_: @@ -4704,74 +4704,74 @@ cant_repeat_two_offset_match_repeat_: CMPL BX, $0x0100ffff JLT repeat_five_match_repeat_ LEAQ -16842747(BX), BX - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_match_repeat_ repeat_five_match_repeat_: LEAQ -65536(BX), BX - MOVQ BX, BP - MOVW $0x001d, (SI) - MOVW BX, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + MOVQ BX, SI + MOVW $0x001d, (DI) + MOVW BX, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP repeat_end_emit_encodeBlockAsmAvx repeat_four_match_repeat_: LEAQ -256(BX), BX - MOVW $0x0019, (SI) - MOVW BX, 2(SI) - ADDQ $0x04, SI + MOVW $0x0019, (DI) + MOVW BX, 2(DI) + ADDQ $0x04, DI JMP repeat_end_emit_encodeBlockAsmAvx repeat_three_match_repeat_: LEAQ -4(BX), BX - MOVW $0x0015, (SI) - MOVB BL, 2(SI) - ADDQ $0x03, SI + MOVW $0x0015, (DI) + MOVB BL, 2(DI) + ADDQ $0x03, DI JMP repeat_end_emit_encodeBlockAsmAvx repeat_two_match_repeat_: SHLL $0x02, BX ORL $0x01, BX - MOVW BX, (SI) - ADDQ $0x02, SI + MOVW BX, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsmAvx repeat_two_offset_match_repeat_: - XORQ DI, DI - LEAQ 1(DI)(BX*4), BX - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (SI) - ADDQ $0x02, SI + XORQ R8, R8 + LEAQ 1(R8)(BX*4), BX + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsmAvx repeat_as_copy_encodeBlockAsmAvx: - CMPL BP, $0x00010000 + CMPL SI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeBlockAsmAvx CMPL BX, $0x40 JLE four_bytes_remain_repeat_as_copy_encodeBlockAsmAvx - MOVB $0xff, (SI) - MOVD BP, 1(SI) + MOVB $0xff, (DI) + MOVD SI, 1(DI) LEAQ -64(BX), BX - ADDQ $0x05, SI + ADDQ $0x05, DI CMPL BX, $0x04 JL four_bytes_remain_repeat_as_copy_encodeBlockAsmAvx emit_repeat_again_repeat_as_copy_encodeBlockAsmAvx_emit_copy: - MOVQ BX, DI + MOVQ BX, R8 LEAQ -4(BX), BX - CMPL DI, $0x08 + CMPL R8, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsmAvx_emit_copy - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy cant_repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy: @@ -4782,52 +4782,52 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy: CMPL BX, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsmAvx_emit_copy LEAQ -16842747(BX), BX - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_repeat_as_copy_encodeBlockAsmAvx_emit_copy repeat_five_repeat_as_copy_encodeBlockAsmAvx_emit_copy: LEAQ -65536(BX), BX - MOVQ BX, BP - MOVW $0x001d, (SI) - MOVW BX, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + MOVQ BX, SI + MOVW $0x001d, (DI) + MOVW BX, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP repeat_end_emit_encodeBlockAsmAvx repeat_four_repeat_as_copy_encodeBlockAsmAvx_emit_copy: LEAQ -256(BX), BX - MOVW $0x0019, (SI) - MOVW BX, 2(SI) - ADDQ $0x04, SI + MOVW $0x0019, (DI) + MOVW BX, 2(DI) + ADDQ $0x04, DI JMP repeat_end_emit_encodeBlockAsmAvx repeat_three_repeat_as_copy_encodeBlockAsmAvx_emit_copy: LEAQ -4(BX), BX - MOVW $0x0015, (SI) - MOVB BL, 2(SI) - ADDQ $0x03, SI + MOVW $0x0015, (DI) + MOVB BL, 2(DI) + ADDQ $0x03, DI JMP repeat_end_emit_encodeBlockAsmAvx repeat_two_repeat_as_copy_encodeBlockAsmAvx_emit_copy: SHLL $0x02, BX ORL $0x01, BX - MOVW BX, (SI) - ADDQ $0x02, SI + MOVW BX, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsmAvx repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy: - XORQ DI, DI - LEAQ 1(DI)(BX*4), BX - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (SI) - ADDQ $0x02, SI + XORQ R8, R8 + LEAQ 1(R8)(BX*4), BX + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsmAvx four_bytes_remain_repeat_as_copy_encodeBlockAsmAvx: @@ -4835,27 +4835,27 @@ four_bytes_remain_repeat_as_copy_encodeBlockAsmAvx: JZ repeat_end_emit_encodeBlockAsmAvx MOVB $0x03, DL LEAQ -4(DX)(BX*4), BX - MOVB BL, (SI) - MOVD BP, 1(SI) - ADDQ $0x05, SI + MOVB BL, (DI) + MOVD SI, 1(DI) + ADDQ $0x05, DI JMP repeat_end_emit_encodeBlockAsmAvx two_byte_offset_repeat_as_copy_encodeBlockAsmAvx: CMPL BX, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsmAvx - MOVB $0xee, (SI) - MOVW BP, 1(SI) + MOVB $0xee, (DI) + MOVW SI, 1(DI) LEAQ -60(BX), BX - ADDQ $0x03, SI + ADDQ $0x03, DI emit_repeat_again_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short: - MOVQ BX, DI + MOVQ BX, R8 LEAQ -4(BX), BX - CMPL DI, $0x08 + CMPL R8, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short cant_repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short: @@ -4866,100 +4866,100 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short: CMPL BX, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short LEAQ -16842747(BX), BX - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short repeat_five_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short: LEAQ -65536(BX), BX - MOVQ BX, BP - MOVW $0x001d, (SI) - MOVW BX, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + MOVQ BX, SI + MOVW $0x001d, (DI) + MOVW BX, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP repeat_end_emit_encodeBlockAsmAvx repeat_four_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short: LEAQ -256(BX), BX - MOVW $0x0019, (SI) - MOVW BX, 2(SI) - ADDQ $0x04, SI + MOVW $0x0019, (DI) + MOVW BX, 2(DI) + ADDQ $0x04, DI JMP repeat_end_emit_encodeBlockAsmAvx repeat_three_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short: LEAQ -4(BX), BX - MOVW $0x0015, (SI) - MOVB BL, 2(SI) - ADDQ $0x03, SI + MOVW $0x0015, (DI) + MOVB BL, 2(DI) + ADDQ $0x03, DI JMP repeat_end_emit_encodeBlockAsmAvx repeat_two_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short: SHLL $0x02, BX ORL $0x01, BX - MOVW BX, (SI) - ADDQ $0x02, SI + MOVW BX, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsmAvx repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short: - XORQ DI, DI - LEAQ 1(DI)(BX*4), BX - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (SI) - ADDQ $0x02, SI + XORQ R8, R8 + LEAQ 1(R8)(BX*4), BX + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsmAvx two_byte_offset_short_repeat_as_copy_encodeBlockAsmAvx: CMPL BX, $0x0c JGE emit_copy_three_repeat_as_copy_encodeBlockAsmAvx - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeBlockAsmAvx MOVB $0x01, DL LEAQ -16(DX)(BX*4), BX - MOVB BP, 1(SI) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (SI) - ADDQ $0x02, SI + MOVB SI, 1(DI) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsmAvx emit_copy_three_repeat_as_copy_encodeBlockAsmAvx: MOVB $0x02, DL LEAQ -4(DX)(BX*4), BX - MOVB BL, (SI) - MOVW BP, 1(SI) - ADDQ $0x03, SI + MOVB BL, (DI) + MOVW SI, 1(DI) + ADDQ $0x03, DI repeat_end_emit_encodeBlockAsmAvx: - MOVQ SI, dst_base+0(FP) + MOVQ DI, dst_base+0(FP) MOVL 16(SP), BX CMPL AX, BX JGT emit_remainder_encodeBlockAsmAvx JMP search_loop_encodeBlockAsmAvx no_repeat_found_encodeBlockAsmAvx: - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ BP, DI - SHRQ $0x10, DI - SHLQ $0x10, DI - IMULQ R8, DI - SHRQ $0x30, DI - CMPL (CX)(BX*1), BP - SHRQ $0x08, BP + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ SI, R8 + SHRQ $0x10, R8 + SHLQ $0x10, R8 + IMULQ R9, R8 + SHRQ $0x30, R8 + CMPL (CX)(BX*1), SI + SHRQ $0x08, SI JEQ candidate_match_encodeBlockAsmAvx - MOVL 32(SP)(DI*1), BX - CMPL (CX)(SI*1), BP + MOVL 32(SP)(R8*1), BX + CMPL (CX)(DI*1), SI JEQ candidate2_match_encodeBlockAsmAvx - LEAQ 2(AX), SI - MOVL SI, 32(SP)(DI*1) - SHRQ $0x08, BP - CMPL (CX)(BX*1), BP + LEAQ 2(AX), DI + MOVL DI, 32(SP)(R8*1) + SHRQ $0x08, SI + CMPL (CX)(BX*1), SI JEQ candidate3_match_encodeBlockAsmAvx MOVL 28(SP), AX JMP search_loop_encodeBlockAsmAvx @@ -4970,21 +4970,21 @@ candidate3_match_encodeBlockAsmAvx: candidate2_match_encodeBlockAsmAvx: LEAQ -2(AX), BX - MOVL BX, 32(SP)(DI*1) + MOVL BX, 32(SP)(R8*1) INCL AX - MOVL SI, BX + MOVL DI, BX candidate_match_encodeBlockAsmAvx: - MOVL 20(SP), BP + MOVL 20(SP), SI TESTL BX, BX JZ match_extend_back_end_encodeBlockAsmAvx match_extend_back_loop_encodeBlockAsmAvx: - CMPL AX, BP + CMPL AX, SI JG match_extend_back_end_encodeBlockAsmAvx MOVB -1(CX)(BX*1), DL - MOVB -1(CX)(AX*1), SI - CMPB DL, SI + MOVB -1(CX)(AX*1), DI + CMPB DL, DI JNE match_extend_back_end_encodeBlockAsmAvx LEAL -1(AX), AX DECL BX @@ -4992,555 +4992,555 @@ match_extend_back_loop_encodeBlockAsmAvx: JMP match_extend_back_loop_encodeBlockAsmAvx match_extend_back_end_encodeBlockAsmAvx: - MOVL AX, BP - SUBL 20(SP), BP - LEAQ dst_base+0(FP)(BP*1), BP - CMPQ BP, (SP) + MOVL AX, SI + SUBL 20(SP), SI + LEAQ dst_base+0(FP)(SI*1), SI + CMPQ SI, (SP) JL match_dst_size_check_encodeBlockAsmAvx MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBlockAsmAvx: - MOVL BX, BP - MOVL 20(SP), SI - CMPL SI, BP + MOVL BX, SI + MOVL 20(SP), DI + CMPL DI, SI JEQ emit_literal_skip_match_emit_encodeBlockAsmAvx - MOVL BP, DI - MOVL BP, 20(SP) - LEAQ (CX)(SI*1), BP - SUBL SI, DI - MOVQ dst_base+0(FP), SI - MOVQ DI, R8 - SUBL $0x01, R8 + MOVL SI, R8 + MOVL SI, 20(SP) + LEAQ (CX)(DI*1), SI + SUBL DI, R8 + MOVQ dst_base+0(FP), DI + MOVQ R8, R9 + SUBL $0x01, R9 JC emit_literal_done_match_emit_encodeBlockAsmAvx - CMPL R8, $0x3c + CMPL R9, $0x3c JLT one_byte_match_emit_encodeBlockAsmAvx - CMPL R8, $0x00000100 + CMPL R9, $0x00000100 JLT two_bytes_match_emit_encodeBlockAsmAvx - CMPL R8, $0x00010000 + CMPL R9, $0x00010000 JLT three_bytes_match_emit_encodeBlockAsmAvx - CMPL R8, $0x01000000 + CMPL R9, $0x01000000 JLT four_bytes_match_emit_encodeBlockAsmAvx - MOVB $0xfc, (SI) - MOVL R8, 1(SI) - ADDQ $0x05, SI + MOVB $0xfc, (DI) + MOVL R9, 1(DI) + ADDQ $0x05, DI JMP memmove_match_emit_encodeBlockAsmAvx four_bytes_match_emit_encodeBlockAsmAvx: - MOVQ R8, R9 - SHRL $0x10, R9 - MOVB $0xf8, (SI) - MOVW R8, 1(SI) - MOVB R9, 3(SI) - ADDQ $0x04, SI + MOVQ R9, R10 + SHRL $0x10, R10 + MOVB $0xf8, (DI) + MOVW R9, 1(DI) + MOVB R10, 3(DI) + ADDQ $0x04, DI JMP memmove_match_emit_encodeBlockAsmAvx three_bytes_match_emit_encodeBlockAsmAvx: - MOVB $0xf4, (SI) - MOVW R8, 1(SI) - ADDQ $0x03, SI + MOVB $0xf4, (DI) + MOVW R9, 1(DI) + ADDQ $0x03, DI JMP memmove_match_emit_encodeBlockAsmAvx two_bytes_match_emit_encodeBlockAsmAvx: - MOVB $0xf0, (SI) - MOVB R8, 1(SI) - ADDQ $0x02, SI + MOVB $0xf0, (DI) + MOVB R9, 1(DI) + ADDQ $0x02, DI JMP memmove_match_emit_encodeBlockAsmAvx one_byte_match_emit_encodeBlockAsmAvx: - SHLB $0x02, R8 - MOVB R8, (SI) - ADDQ $0x01, SI + SHLB $0x02, R9 + MOVB R9, (DI) + ADDQ $0x01, DI memmove_match_emit_encodeBlockAsmAvx: - LEAQ (SI)(DI*1), R8 + LEAQ (DI)(R8*1), R9 NOP emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_tail: - TESTQ DI, DI + TESTQ R8, R8 JEQ emit_literal_done_match_emit_encodeBlockAsmAvx - CMPQ DI, $0x02 + CMPQ R8, $0x02 JBE emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_1or2 - CMPQ DI, $0x04 + CMPQ R8, $0x04 JB emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_3 JBE emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_4 - CMPQ DI, $0x08 + CMPQ R8, $0x08 JB emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_5through7 JE emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_8 - CMPQ DI, $0x10 + CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_9through16 - CMPQ DI, $0x20 + CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_17through32 - CMPQ DI, $0x40 + CMPQ R8, $0x40 JBE emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_33through64 - CMPQ DI, $0x80 + CMPQ R8, $0x80 JBE emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_65through128 - CMPQ DI, $0x00000100 + CMPQ R8, $0x00000100 JBE emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_129through256 JMP emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_avxUnaligned emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_1or2: - MOVB (BP), R8 - MOVB -1(BP)(DI*1), R9 - MOVB R8, (SI) - MOVB R9, -1(SI)(DI*1) + MOVB (SI), R9 + MOVB -1(SI)(R8*1), R10 + MOVB R9, (DI) + MOVB R10, -1(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsmAvx emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_4: - MOVL (BP), R8 - MOVL R8, (SI) + MOVL (SI), R9 + MOVL R9, (DI) JMP emit_literal_done_match_emit_encodeBlockAsmAvx emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_3: - MOVW (BP), R8 - MOVB 2(BP), R9 - MOVW R8, (SI) - MOVB R9, 2(SI) + MOVW (SI), R9 + MOVB 2(SI), R10 + MOVW R9, (DI) + MOVB R10, 2(DI) JMP emit_literal_done_match_emit_encodeBlockAsmAvx emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_5through7: - MOVL (BP), R8 - MOVL -4(BP)(DI*1), R9 - MOVL R8, (SI) - MOVL R9, -4(SI)(DI*1) + MOVL (SI), R9 + MOVL -4(SI)(R8*1), R10 + MOVL R9, (DI) + MOVL R10, -4(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsmAvx emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_8: - MOVQ (BP), R8 - MOVQ R8, (SI) + MOVQ (SI), R9 + MOVQ R9, (DI) JMP emit_literal_done_match_emit_encodeBlockAsmAvx emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_9through16: - MOVQ (BP), R8 - MOVQ -8(BP)(DI*1), R9 - MOVQ R8, (SI) - MOVQ R9, -8(SI)(DI*1) + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), R10 + MOVQ R9, (DI) + MOVQ R10, -8(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsmAvx emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_17through32: - MOVOU (BP), X0 - MOVOU -16(BP)(DI*1), X1 - MOVOU X0, (SI) - MOVOU X1, -16(SI)(DI*1) + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 + MOVOU X0, (DI) + MOVOU X1, -16(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsmAvx emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_33through64: - MOVOU (BP), X0 - MOVOU 16(BP), X1 - MOVOU -32(BP)(DI*1), X2 - MOVOU -16(BP)(DI*1), X3 - MOVOU X0, (SI) - MOVOU X1, 16(SI) - MOVOU X2, -32(SI)(DI*1) - MOVOU X3, -16(SI)(DI*1) + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, -32(DI)(R8*1) + MOVOU X3, -16(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsmAvx emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_65through128: - MOVOU (BP), X0 - MOVOU 16(BP), X1 - MOVOU 32(BP), X2 - MOVOU 48(BP), X3 - MOVOU -64(BP)(DI*1), X12 - MOVOU -48(BP)(DI*1), X13 - MOVOU -32(BP)(DI*1), X14 - MOVOU -16(BP)(DI*1), X15 - MOVOU X0, (SI) - MOVOU X1, 16(SI) - MOVOU X2, 32(SI) - MOVOU X3, 48(SI) - MOVOU X12, -64(SI)(DI*1) - MOVOU X13, -48(SI)(DI*1) - MOVOU X14, -32(SI)(DI*1) - MOVOU X15, -16(SI)(DI*1) + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) + MOVOU X3, 48(DI) + MOVOU X12, -64(DI)(R8*1) + MOVOU X13, -48(DI)(R8*1) + MOVOU X14, -32(DI)(R8*1) + MOVOU X15, -16(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsmAvx emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_129through256: - MOVOU (BP), X0 - MOVOU 16(BP), X1 - MOVOU 32(BP), X2 - MOVOU 48(BP), X3 - MOVOU 64(BP), X4 - MOVOU 80(BP), X5 - MOVOU 96(BP), X6 - MOVOU 112(BP), X7 - MOVOU -128(BP)(DI*1), X8 - MOVOU -112(BP)(DI*1), X9 - MOVOU -96(BP)(DI*1), X10 - MOVOU -80(BP)(DI*1), X11 - MOVOU -64(BP)(DI*1), X12 - MOVOU -48(BP)(DI*1), X13 - MOVOU -32(BP)(DI*1), X14 - MOVOU -16(BP)(DI*1), X15 - MOVOU X0, (SI) - MOVOU X1, 16(SI) - MOVOU X2, 32(SI) - MOVOU X3, 48(SI) - MOVOU X4, 64(SI) - MOVOU X5, 80(SI) - MOVOU X6, 96(SI) - MOVOU X7, 112(SI) - MOVOU X8, -128(SI)(DI*1) - MOVOU X9, -112(SI)(DI*1) - MOVOU X10, -96(SI)(DI*1) - MOVOU X11, -80(SI)(DI*1) - MOVOU X12, -64(SI)(DI*1) - MOVOU X13, -48(SI)(DI*1) - MOVOU X14, -32(SI)(DI*1) - MOVOU X15, -16(SI)(DI*1) + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU -128(SI)(R8*1), X8 + MOVOU -112(SI)(R8*1), X9 + MOVOU -96(SI)(R8*1), X10 + MOVOU -80(SI)(R8*1), X11 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) + MOVOU X3, 48(DI) + MOVOU X4, 64(DI) + MOVOU X5, 80(DI) + MOVOU X6, 96(DI) + MOVOU X7, 112(DI) + MOVOU X8, -128(DI)(R8*1) + MOVOU X9, -112(DI)(R8*1) + MOVOU X10, -96(DI)(R8*1) + MOVOU X11, -80(DI)(R8*1) + MOVOU X12, -64(DI)(R8*1) + MOVOU X13, -48(DI)(R8*1) + MOVOU X14, -32(DI)(R8*1) + MOVOU X15, -16(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsmAvx emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_256through2048: - LEAQ -256(DI), DI - MOVOU (BP), X0 - MOVOU 16(BP), X1 - MOVOU 32(BP), X2 - MOVOU 48(BP), X3 - MOVOU 64(BP), X4 - MOVOU 80(BP), X5 - MOVOU 96(BP), X6 - MOVOU 112(BP), X7 - MOVOU 128(BP), X8 - MOVOU 144(BP), X9 - MOVOU 160(BP), X10 - MOVOU 176(BP), X11 - MOVOU 192(BP), X12 - MOVOU 208(BP), X13 - MOVOU 224(BP), X14 - MOVOU 240(BP), X15 - MOVOU X0, (SI) - MOVOU X1, 16(SI) - MOVOU X2, 32(SI) - MOVOU X3, 48(SI) - MOVOU X4, 64(SI) - MOVOU X5, 80(SI) - MOVOU X6, 96(SI) - MOVOU X7, 112(SI) - MOVOU X8, 128(SI) - MOVOU X9, 144(SI) - MOVOU X10, 160(SI) - MOVOU X11, 176(SI) - MOVOU X12, 192(SI) - MOVOU X13, 208(SI) - MOVOU X14, 224(SI) - MOVOU X15, 240(SI) - CMPQ DI, $0x00000100 - LEAQ 256(BP), BP + LEAQ -256(R8), R8 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU 128(SI), X8 + MOVOU 144(SI), X9 + MOVOU 160(SI), X10 + MOVOU 176(SI), X11 + MOVOU 192(SI), X12 + MOVOU 208(SI), X13 + MOVOU 224(SI), X14 + MOVOU 240(SI), X15 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) + MOVOU X3, 48(DI) + MOVOU X4, 64(DI) + MOVOU X5, 80(DI) + MOVOU X6, 96(DI) + MOVOU X7, 112(DI) + MOVOU X8, 128(DI) + MOVOU X9, 144(DI) + MOVOU X10, 160(DI) + MOVOU X11, 176(DI) + MOVOU X12, 192(DI) + MOVOU X13, 208(DI) + MOVOU X14, 224(DI) + MOVOU X15, 240(DI) + CMPQ R8, $0x00000100 LEAQ 256(SI), SI + LEAQ 256(DI), DI JGE emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_256through2048 JMP emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_tail emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_avxUnaligned: - LEAQ (BP)(DI*1), R9 - MOVQ SI, R11 - MOVOU -128(R9), X5 - MOVOU -112(R9), X6 - MOVQ $0x00000080, R8 - ANDQ $0xffffffe0, SI - ADDQ $0x20, SI - MOVOU -96(R9), X7 - MOVOU -80(R9), X8 - MOVQ SI, R10 - SUBQ R11, R10 - MOVOU -64(R9), X9 - MOVOU -48(R9), X10 - SUBQ R10, DI - MOVOU -32(R9), X11 - MOVOU -16(R9), X12 - VMOVDQU (BP), Y4 - ADDQ R10, BP - SUBQ R8, DI + LEAQ (SI)(R8*1), R10 + MOVQ DI, R12 + MOVOU -128(R10), X5 + MOVOU -112(R10), X6 + MOVQ $0x00000080, R9 + ANDQ $0xffffffe0, DI + ADDQ $0x20, DI + MOVOU -96(R10), X7 + MOVOU -80(R10), X8 + MOVQ DI, R11 + SUBQ R12, R11 + MOVOU -64(R10), X9 + MOVOU -48(R10), X10 + SUBQ R11, R8 + MOVOU -32(R10), X11 + MOVOU -16(R10), X12 + VMOVDQU (SI), Y4 + ADDQ R11, SI + SUBQ R9, R8 emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_gobble_128_loop: - VMOVDQU (BP), Y0 - VMOVDQU 32(BP), Y1 - VMOVDQU 64(BP), Y2 - VMOVDQU 96(BP), Y3 - ADDQ R8, BP - VMOVDQA Y0, (SI) - VMOVDQA Y1, 32(SI) - VMOVDQA Y2, 64(SI) - VMOVDQA Y3, 96(SI) - ADDQ R8, SI - SUBQ R8, DI + VMOVDQU (SI), Y0 + VMOVDQU 32(SI), Y1 + VMOVDQU 64(SI), Y2 + VMOVDQU 96(SI), Y3 + ADDQ R9, SI + VMOVDQA Y0, (DI) + VMOVDQA Y1, 32(DI) + VMOVDQA Y2, 64(DI) + VMOVDQA Y3, 96(DI) + ADDQ R9, DI + SUBQ R9, R8 JA emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_gobble_128_loop - ADDQ R8, DI - ADDQ SI, DI - VMOVDQU Y4, (R11) + ADDQ R9, R8 + ADDQ DI, R8 + VMOVDQU Y4, (R12) VZEROUPPER - MOVOU X5, -128(DI) - MOVOU X6, -112(DI) - MOVOU X7, -96(DI) - MOVOU X8, -80(DI) - MOVOU X9, -64(DI) - MOVOU X10, -48(DI) - MOVOU X11, -32(DI) - MOVOU X12, -16(DI) + MOVOU X5, -128(R8) + MOVOU X6, -112(R8) + MOVOU X7, -96(R8) + MOVOU X8, -80(R8) + MOVOU X9, -64(R8) + MOVOU X10, -48(R8) + MOVOU X11, -32(R8) + MOVOU X12, -16(R8) JMP emit_literal_done_match_emit_encodeBlockAsmAvx - MOVQ R8, SI + MOVQ R9, DI emit_literal_done_match_emit_encodeBlockAsmAvx: - MOVQ SI, dst_base+0(FP) + MOVQ DI, dst_base+0(FP) emit_literal_skip_match_emit_encodeBlockAsmAvx: NOP match_nolit_loop_encodeBlockAsmAvx: - MOVL AX, BP - MOVL AX, BP - SUBL BX, BP - MOVL BP, 24(SP) + MOVL AX, SI + MOVL AX, SI + SUBL BX, SI + MOVL SI, 24(SP) ADDL $0x04, AX ADDL $0x04, BX - MOVL 16(SP), BP - SUBL AX, BP - XORQ DI, DI - CMPQ BP, $0x08 + MOVL 16(SP), SI + SUBL AX, SI + XORQ R8, R8 + CMPQ SI, $0x08 JL matchlen_single_match_nolit_encodeBlockAsmAvx matchlen_loopback_match_nolit_encodeBlockAsmAvx: - MOVQ (CX)(DI*1), SI - XORQ (CX)(DI*1), SI - TESTQ SI, SI + MOVQ (CX)(R8*1), DI + XORQ (CX)(R8*1), DI + TESTQ DI, DI JZ matchlen_loop_match_nolit_encodeBlockAsmAvx - BSFQ SI, SI - SARQ $0x03, SI - LEAQ (DI)(SI*1), DI + BSFQ DI, DI + SARQ $0x03, DI + LEAQ (R8)(DI*1), R8 JMP match_nolit_end_encodeBlockAsmAvx matchlen_loop_match_nolit_encodeBlockAsmAvx: - LEAQ -8(BP), BP - LEAQ 8(DI), DI - CMPQ BP, $0x08 + LEAQ -8(SI), SI + LEAQ 8(R8), R8 + CMPQ SI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsmAvx matchlen_single_match_nolit_encodeBlockAsmAvx: - TESTQ BP, BP + TESTQ SI, SI JZ match_nolit_end_encodeBlockAsmAvx matchlen_single_loopback_match_nolit_encodeBlockAsmAvx: - MOVB (CX)(DI*1), SI - CMPB (CX)(DI*1), SI + MOVB (CX)(R8*1), DI + CMPB (CX)(R8*1), DI JNE match_nolit_end_encodeBlockAsmAvx - LEAQ 1(DI), DI - DECQ BP + LEAQ 1(R8), R8 + DECQ SI JNZ matchlen_single_loopback_match_nolit_encodeBlockAsmAvx match_nolit_end_encodeBlockAsmAvx: - MOVL 24(SP), BP - ADDQ $0x04, DI - MOVQ dst_base+0(FP), SI - ADDL DI, AX - CMPL BP, $0x00010000 + MOVL 24(SP), SI + ADDQ $0x04, R8 + MOVQ dst_base+0(FP), DI + ADDL R8, AX + CMPL SI, $0x00010000 JL two_byte_offset_match_nolit_encodeBlockAsmAvx - CMPL DI, $0x40 + CMPL R8, $0x40 JLE four_bytes_remain_match_nolit_encodeBlockAsmAvx - MOVB $0xff, (SI) - MOVD BP, 1(SI) - LEAQ -64(DI), DI - ADDQ $0x05, SI - CMPL DI, $0x04 + MOVB $0xff, (DI) + MOVD SI, 1(DI) + LEAQ -64(R8), R8 + ADDQ $0x05, DI + CMPL R8, $0x04 JL four_bytes_remain_match_nolit_encodeBlockAsmAvx emit_repeat_again_match_nolit_encodeBlockAsmAvx_emit_copy: - MOVQ DI, R8 - LEAQ -4(DI), DI - CMPL R8, $0x08 + MOVQ R8, R9 + LEAQ -4(R8), R8 + CMPL R9, $0x08 JLE repeat_two_match_nolit_encodeBlockAsmAvx_emit_copy - CMPL R8, $0x0c + CMPL R9, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsmAvx_emit_copy - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsmAvx_emit_copy cant_repeat_two_offset_match_nolit_encodeBlockAsmAvx_emit_copy: - CMPL DI, $0x00000104 + CMPL R8, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsmAvx_emit_copy - CMPL DI, $0x00010100 + CMPL R8, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsmAvx_emit_copy - CMPL DI, $0x0100ffff + CMPL R8, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsmAvx_emit_copy - LEAQ -16842747(DI), DI - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + LEAQ -16842747(R8), R8 + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_match_nolit_encodeBlockAsmAvx_emit_copy repeat_five_match_nolit_encodeBlockAsmAvx_emit_copy: - LEAQ -65536(DI), DI - MOVQ DI, BP - MOVW $0x001d, (SI) - MOVW DI, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + LEAQ -65536(R8), R8 + MOVQ R8, SI + MOVW $0x001d, (DI) + MOVW R8, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP match_nolit_emitcopy_end_encodeBlockAsmAvx repeat_four_match_nolit_encodeBlockAsmAvx_emit_copy: - LEAQ -256(DI), DI - MOVW $0x0019, (SI) - MOVW DI, 2(SI) - ADDQ $0x04, SI + LEAQ -256(R8), R8 + MOVW $0x0019, (DI) + MOVW R8, 2(DI) + ADDQ $0x04, DI JMP match_nolit_emitcopy_end_encodeBlockAsmAvx repeat_three_match_nolit_encodeBlockAsmAvx_emit_copy: - LEAQ -4(DI), DI - MOVW $0x0015, (SI) - MOVB DI, 2(SI) - ADDQ $0x03, SI + LEAQ -4(R8), R8 + MOVW $0x0015, (DI) + MOVB R8, 2(DI) + ADDQ $0x03, DI JMP match_nolit_emitcopy_end_encodeBlockAsmAvx repeat_two_match_nolit_encodeBlockAsmAvx_emit_copy: - SHLL $0x02, DI - ORL $0x01, DI - MOVW DI, (SI) - ADDQ $0x02, SI + SHLL $0x02, R8 + ORL $0x01, R8 + MOVW R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsmAvx repeat_two_offset_match_nolit_encodeBlockAsmAvx_emit_copy: - XORQ R8, R8 - LEAQ 1(R8)(DI*4), DI - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, DI - MOVB DI, (SI) - ADDQ $0x02, SI + XORQ R9, R9 + LEAQ 1(R9)(R8*4), R8 + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R8 + MOVB R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsmAvx four_bytes_remain_match_nolit_encodeBlockAsmAvx: - TESTL DI, DI + TESTL R8, R8 JZ match_nolit_emitcopy_end_encodeBlockAsmAvx MOVB $0x03, DL - LEAQ -4(DX)(DI*4), DI - MOVB DI, (SI) - MOVD BP, 1(SI) - ADDQ $0x05, SI + LEAQ -4(DX)(R8*4), R8 + MOVB R8, (DI) + MOVD SI, 1(DI) + ADDQ $0x05, DI JMP match_nolit_emitcopy_end_encodeBlockAsmAvx two_byte_offset_match_nolit_encodeBlockAsmAvx: - CMPL DI, $0x40 + CMPL R8, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsmAvx - MOVB $0xee, (SI) - MOVW BP, 1(SI) - LEAQ -60(DI), DI - ADDQ $0x03, SI + MOVB $0xee, (DI) + MOVW SI, 1(DI) + LEAQ -60(R8), R8 + ADDQ $0x03, DI emit_repeat_again_match_nolit_encodeBlockAsmAvx_emit_copy_short: - MOVQ DI, R8 - LEAQ -4(DI), DI - CMPL R8, $0x08 + MOVQ R8, R9 + LEAQ -4(R8), R8 + CMPL R9, $0x08 JLE repeat_two_match_nolit_encodeBlockAsmAvx_emit_copy_short - CMPL R8, $0x0c + CMPL R9, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsmAvx_emit_copy_short - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsmAvx_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBlockAsmAvx_emit_copy_short: - CMPL DI, $0x00000104 + CMPL R8, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsmAvx_emit_copy_short - CMPL DI, $0x00010100 + CMPL R8, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsmAvx_emit_copy_short - CMPL DI, $0x0100ffff + CMPL R8, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsmAvx_emit_copy_short - LEAQ -16842747(DI), DI - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + LEAQ -16842747(R8), R8 + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_match_nolit_encodeBlockAsmAvx_emit_copy_short repeat_five_match_nolit_encodeBlockAsmAvx_emit_copy_short: - LEAQ -65536(DI), DI - MOVQ DI, BP - MOVW $0x001d, (SI) - MOVW DI, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + LEAQ -65536(R8), R8 + MOVQ R8, SI + MOVW $0x001d, (DI) + MOVW R8, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP match_nolit_emitcopy_end_encodeBlockAsmAvx repeat_four_match_nolit_encodeBlockAsmAvx_emit_copy_short: - LEAQ -256(DI), DI - MOVW $0x0019, (SI) - MOVW DI, 2(SI) - ADDQ $0x04, SI + LEAQ -256(R8), R8 + MOVW $0x0019, (DI) + MOVW R8, 2(DI) + ADDQ $0x04, DI JMP match_nolit_emitcopy_end_encodeBlockAsmAvx repeat_three_match_nolit_encodeBlockAsmAvx_emit_copy_short: - LEAQ -4(DI), DI - MOVW $0x0015, (SI) - MOVB DI, 2(SI) - ADDQ $0x03, SI + LEAQ -4(R8), R8 + MOVW $0x0015, (DI) + MOVB R8, 2(DI) + ADDQ $0x03, DI JMP match_nolit_emitcopy_end_encodeBlockAsmAvx repeat_two_match_nolit_encodeBlockAsmAvx_emit_copy_short: - SHLL $0x02, DI - ORL $0x01, DI - MOVW DI, (SI) - ADDQ $0x02, SI + SHLL $0x02, R8 + ORL $0x01, R8 + MOVW R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsmAvx repeat_two_offset_match_nolit_encodeBlockAsmAvx_emit_copy_short: - XORQ R8, R8 - LEAQ 1(R8)(DI*4), DI - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, DI - MOVB DI, (SI) - ADDQ $0x02, SI + XORQ R9, R9 + LEAQ 1(R9)(R8*4), R8 + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R8 + MOVB R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsmAvx two_byte_offset_short_match_nolit_encodeBlockAsmAvx: - CMPL DI, $0x0c + CMPL R8, $0x0c JGE emit_copy_three_match_nolit_encodeBlockAsmAvx - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_match_nolit_encodeBlockAsmAvx MOVB $0x01, DL - LEAQ -16(DX)(DI*4), DI - MOVB BP, 1(SI) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, DI - MOVB DI, (SI) - ADDQ $0x02, SI + LEAQ -16(DX)(R8*4), R8 + MOVB SI, 1(DI) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R8 + MOVB R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsmAvx emit_copy_three_match_nolit_encodeBlockAsmAvx: MOVB $0x02, DL - LEAQ -4(DX)(DI*4), DI - MOVB DI, (SI) - MOVW BP, 1(SI) - ADDQ $0x03, SI + LEAQ -4(DX)(R8*4), R8 + MOVB R8, (DI) + MOVW SI, 1(DI) + ADDQ $0x03, DI match_nolit_emitcopy_end_encodeBlockAsmAvx: - MOVQ SI, dst_base+0(FP) + MOVQ DI, dst_base+0(FP) MOVL AX, 20(SP) CMPL AX, 16(SP) JGE emit_remainder_encodeBlockAsmAvx - CMPQ SI, (SP) + CMPQ DI, (SP) JL match_nolit_dst_ok_encodeBlockAsmAvx MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeBlockAsmAvx: - MOVQ -2(CX)(AX*1), BP - MOVQ $0x0000cf1bbcdcbf9b, SI - MOVQ BP, DI - SHRQ $0x10, BP - MOVQ BP, R8 - SHLQ $0x10, DI - IMULQ SI, DI - SHRQ $0x30, DI + MOVQ -2(CX)(AX*1), SI + MOVQ $0x0000cf1bbcdcbf9b, DI + MOVQ SI, R8 + SHRQ $0x10, SI + MOVQ SI, R9 SHLQ $0x10, R8 - IMULQ SI, R8 + IMULQ DI, R8 SHRQ $0x30, R8 - MOVL 32(SP)(DI*1), SI - MOVL 32(SP)(R8*1), SI - LEAQ -2(AX), SI - MOVL SI, 32(SP)(DI*1) - MOVL AX, 32(SP)(R8*1) - CMPL (CX)(R8*1), BP + SHLQ $0x10, R9 + IMULQ DI, R9 + SHRQ $0x30, R9 + MOVL 32(SP)(R8*1), DI + MOVL 32(SP)(R9*1), DI + LEAQ -2(AX), DI + MOVL DI, 32(SP)(R8*1) + MOVL AX, 32(SP)(R9*1) + CMPL (CX)(R9*1), SI JEQ match_nolit_loop_encodeBlockAsmAvx INCL AX JMP search_loop_encodeBlockAsmAvx @@ -5582,11 +5582,11 @@ emit_remainder_ok_encodeBlockAsmAvx: JMP memmove_emit_remainder_encodeBlockAsmAvx four_bytes_emit_remainder_encodeBlockAsmAvx: - MOVQ DX, BP - SHRL $0x10, BP + MOVQ DX, SI + SHRL $0x10, SI MOVB $0xf8, (CX) MOVW DX, 1(CX) - MOVB BP, 3(CX) + MOVB SI, 3(CX) ADDQ $0x04, CX JMP memmove_emit_remainder_encodeBlockAsmAvx @@ -5636,9 +5636,9 @@ emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_tail: emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_1or2: MOVB (AX), DL - MOVB -1(AX)(BX*1), BP + MOVB -1(AX)(BX*1), SI MOVB DL, (CX) - MOVB BP, -1(CX)(BX*1) + MOVB SI, -1(CX)(BX*1) JMP emit_literal_done_emit_remainder_encodeBlockAsmAvx emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_4: @@ -5648,16 +5648,16 @@ emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_4: emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_3: MOVW (AX), DX - MOVB 2(AX), BP + MOVB 2(AX), SI MOVW DX, (CX) - MOVB BP, 2(CX) + MOVB SI, 2(CX) JMP emit_literal_done_emit_remainder_encodeBlockAsmAvx emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_5through7: MOVL (AX), DX - MOVL -4(AX)(BX*1), BP + MOVL -4(AX)(BX*1), SI MOVL DX, (CX) - MOVL BP, -4(CX)(BX*1) + MOVL SI, -4(CX)(BX*1) JMP emit_literal_done_emit_remainder_encodeBlockAsmAvx emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_8: @@ -5667,9 +5667,9 @@ emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_8: emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_9through16: MOVQ (AX), DX - MOVQ -8(AX)(BX*1), BP + MOVQ -8(AX)(BX*1), SI MOVQ DX, (CX) - MOVQ BP, -8(CX)(BX*1) + MOVQ SI, -8(CX)(BX*1) JMP emit_literal_done_emit_remainder_encodeBlockAsmAvx emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_17through32: @@ -5785,24 +5785,24 @@ emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_move_256through2048: JMP emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_tail emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_avxUnaligned: - LEAQ (AX)(BX*1), BP - MOVQ CX, DI - MOVOU -128(BP), X5 - MOVOU -112(BP), X6 + LEAQ (AX)(BX*1), SI + MOVQ CX, R8 + MOVOU -128(SI), X5 + MOVOU -112(SI), X6 MOVQ $0x00000080, DX ANDQ $0xffffffe0, CX ADDQ $0x20, CX - MOVOU -96(BP), X7 - MOVOU -80(BP), X8 - MOVQ CX, SI - SUBQ DI, SI - MOVOU -64(BP), X9 - MOVOU -48(BP), X10 - SUBQ SI, BX - MOVOU -32(BP), X11 - MOVOU -16(BP), X12 + MOVOU -96(SI), X7 + MOVOU -80(SI), X8 + MOVQ CX, DI + SUBQ R8, DI + MOVOU -64(SI), X9 + MOVOU -48(SI), X10 + SUBQ DI, BX + MOVOU -32(SI), X11 + MOVOU -16(SI), X12 VMOVDQU (AX), Y4 - ADDQ SI, AX + ADDQ DI, AX SUBQ DX, BX emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_gobble_128_loop: @@ -5820,7 +5820,7 @@ emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_gobble_128_loop: JA emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_gobble_128_loop ADDQ DX, BX ADDQ CX, BX - VMOVDQU Y4, (DI) + VMOVDQU Y4, (R8) VZEROUPPER MOVOU X5, -128(BX) MOVOU X6, -112(BX) @@ -5877,224 +5877,224 @@ zero_loop_encodeBlockAsm14BAvx: MOVQ src_base+24(FP), CX search_loop_encodeBlockAsm14BAvx: - MOVQ (CX)(AX*1), BP + MOVQ (CX)(AX*1), SI MOVL AX, BX SUBL 20(SP), BX SHRL $0x05, BX LEAQ 4(AX)(BX*1), BX - MOVL 16(SP), SI - CMPL BX, SI + MOVL 16(SP), DI + CMPL BX, DI JGT emit_remainder_encodeBlockAsm14BAvx MOVL BX, 28(SP) MOVQ $0x0000cf1bbcdcbf9b, BX - MOVQ BP, DI - MOVQ BP, R8 - SHRQ $0x08, R8 - SHLQ $0x10, DI - IMULQ BX, DI - SHRQ $0x32, DI + MOVQ SI, R8 + MOVQ SI, R9 + SHRQ $0x08, R9 SHLQ $0x10, R8 IMULQ BX, R8 SHRQ $0x32, R8 - MOVL 32(SP)(DI*1), BX - MOVL 32(SP)(R8*1), SI - MOVL AX, 32(SP)(DI*1) - LEAL 1(AX), DI - MOVL DI, 32(SP)(R8*1) - MOVL AX, DI - SUBL 24(SP), DI - MOVL 1(CX)(DI*1), R9 - MOVQ BP, R8 - SHLQ $0x08, R8 - CMPL R8, R9 + SHLQ $0x10, R9 + IMULQ BX, R9 + SHRQ $0x32, R9 + MOVL 32(SP)(R8*1), BX + MOVL 32(SP)(R9*1), DI + MOVL AX, 32(SP)(R8*1) + LEAL 1(AX), R8 + MOVL R8, 32(SP)(R9*1) + MOVL AX, R8 + SUBL 24(SP), R8 + MOVL 1(CX)(R8*1), R10 + MOVQ SI, R9 + SHLQ $0x08, R9 + CMPL R9, R10 JNE no_repeat_found_encodeBlockAsm14BAvx - LEAQ 1(AX), BP + LEAQ 1(AX), SI MOVL 20(SP), BX - TESTL DI, DI + TESTL R8, R8 JZ repeat_extend_back_end_encodeBlockAsm14BAvx repeat_extend_back_loop_encodeBlockAsm14BAvx: - CMPL BP, BX + CMPL SI, BX JG repeat_extend_back_end_encodeBlockAsm14BAvx - MOVB -1(CX)(DI*1), DL - MOVB -1(CX)(BP*1), SI - CMPB DL, SI + MOVB -1(CX)(R8*1), DL + MOVB -1(CX)(SI*1), DI + CMPB DL, DI JNE repeat_extend_back_end_encodeBlockAsm14BAvx - LEAQ -1(BP), BP - DECL DI + LEAQ -1(SI), SI + DECL R8 JZ repeat_extend_back_end_encodeBlockAsm14BAvx JMP repeat_extend_back_loop_encodeBlockAsm14BAvx repeat_extend_back_end_encodeBlockAsm14BAvx: MOVL 20(SP), BX - CMPL BX, BP + CMPL BX, SI JEQ emit_literal_skip_repeat_emit_encodeBlockAsm14BAvx - MOVL BP, SI - MOVL BP, 20(SP) - LEAQ (CX)(BX*1), DI - SUBL BX, SI + MOVL SI, DI + MOVL SI, 20(SP) + LEAQ (CX)(BX*1), R8 + SUBL BX, DI MOVQ dst_base+0(FP), BX - MOVQ SI, R8 - SUBL $0x01, R8 + MOVQ DI, R9 + SUBL $0x01, R9 JC emit_literal_done_repeat_emit_encodeBlockAsm14BAvx - CMPL R8, $0x3c + CMPL R9, $0x3c JLT one_byte_repeat_emit_encodeBlockAsm14BAvx - CMPL R8, $0x00000100 + CMPL R9, $0x00000100 JLT two_bytes_repeat_emit_encodeBlockAsm14BAvx - CMPL R8, $0x00010000 + CMPL R9, $0x00010000 JLT three_bytes_repeat_emit_encodeBlockAsm14BAvx - CMPL R8, $0x01000000 + CMPL R9, $0x01000000 JLT four_bytes_repeat_emit_encodeBlockAsm14BAvx MOVB $0xfc, (BX) - MOVL R8, 1(BX) + MOVL R9, 1(BX) ADDQ $0x05, BX JMP memmove_repeat_emit_encodeBlockAsm14BAvx four_bytes_repeat_emit_encodeBlockAsm14BAvx: - MOVQ R8, R9 - SHRL $0x10, R9 + MOVQ R9, R10 + SHRL $0x10, R10 MOVB $0xf8, (BX) - MOVW R8, 1(BX) - MOVB R9, 3(BX) + MOVW R9, 1(BX) + MOVB R10, 3(BX) ADDQ $0x04, BX JMP memmove_repeat_emit_encodeBlockAsm14BAvx three_bytes_repeat_emit_encodeBlockAsm14BAvx: MOVB $0xf4, (BX) - MOVW R8, 1(BX) + MOVW R9, 1(BX) ADDQ $0x03, BX JMP memmove_repeat_emit_encodeBlockAsm14BAvx two_bytes_repeat_emit_encodeBlockAsm14BAvx: MOVB $0xf0, (BX) - MOVB R8, 1(BX) + MOVB R9, 1(BX) ADDQ $0x02, BX JMP memmove_repeat_emit_encodeBlockAsm14BAvx one_byte_repeat_emit_encodeBlockAsm14BAvx: - SHLB $0x02, R8 - MOVB R8, (BX) + SHLB $0x02, R9 + MOVB R9, (BX) ADDQ $0x01, BX memmove_repeat_emit_encodeBlockAsm14BAvx: - LEAQ (BX)(SI*1), R8 + LEAQ (BX)(DI*1), R9 NOP emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_tail: - TESTQ SI, SI + TESTQ DI, DI JEQ emit_literal_done_repeat_emit_encodeBlockAsm14BAvx - CMPQ SI, $0x02 + CMPQ DI, $0x02 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_1or2 - CMPQ SI, $0x04 + CMPQ DI, $0x04 JB emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_3 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_4 - CMPQ SI, $0x08 + CMPQ DI, $0x08 JB emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_5through7 JE emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_8 - CMPQ SI, $0x10 + CMPQ DI, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_9through16 - CMPQ SI, $0x20 + CMPQ DI, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_17through32 - CMPQ SI, $0x40 + CMPQ DI, $0x40 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_33through64 - CMPQ SI, $0x80 + CMPQ DI, $0x80 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_65through128 - CMPQ SI, $0x00000100 + CMPQ DI, $0x00000100 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_129through256 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_avxUnaligned emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_1or2: - MOVB (DI), R8 - MOVB -1(DI)(SI*1), R9 - MOVB R8, (BX) - MOVB R9, -1(BX)(SI*1) + MOVB (R8), R9 + MOVB -1(R8)(DI*1), R10 + MOVB R9, (BX) + MOVB R10, -1(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm14BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_4: - MOVL (DI), R8 - MOVL R8, (BX) + MOVL (R8), R9 + MOVL R9, (BX) JMP emit_literal_done_repeat_emit_encodeBlockAsm14BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_3: - MOVW (DI), R8 - MOVB 2(DI), R9 - MOVW R8, (BX) - MOVB R9, 2(BX) + MOVW (R8), R9 + MOVB 2(R8), R10 + MOVW R9, (BX) + MOVB R10, 2(BX) JMP emit_literal_done_repeat_emit_encodeBlockAsm14BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_5through7: - MOVL (DI), R8 - MOVL -4(DI)(SI*1), R9 - MOVL R8, (BX) - MOVL R9, -4(BX)(SI*1) + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R10 + MOVL R9, (BX) + MOVL R10, -4(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm14BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_8: - MOVQ (DI), R8 - MOVQ R8, (BX) + MOVQ (R8), R9 + MOVQ R9, (BX) JMP emit_literal_done_repeat_emit_encodeBlockAsm14BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_9through16: - MOVQ (DI), R8 - MOVQ -8(DI)(SI*1), R9 - MOVQ R8, (BX) - MOVQ R9, -8(BX)(SI*1) + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R10 + MOVQ R9, (BX) + MOVQ R10, -8(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm14BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(SI*1), X1 + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 MOVOU X0, (BX) - MOVOU X1, -16(BX)(SI*1) + MOVOU X1, -16(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm14BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(SI*1), X2 - MOVOU -16(DI)(SI*1), X3 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 MOVOU X0, (BX) MOVOU X1, 16(BX) - MOVOU X2, -32(BX)(SI*1) - MOVOU X3, -16(BX)(SI*1) + MOVOU X2, -32(BX)(DI*1) + MOVOU X3, -16(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm14BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_65through128: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU 32(DI), X2 - MOVOU 48(DI), X3 - MOVOU -64(DI)(SI*1), X12 - MOVOU -48(DI)(SI*1), X13 - MOVOU -32(DI)(SI*1), X14 - MOVOU -16(DI)(SI*1), X15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (BX) MOVOU X1, 16(BX) MOVOU X2, 32(BX) MOVOU X3, 48(BX) - MOVOU X12, -64(BX)(SI*1) - MOVOU X13, -48(BX)(SI*1) - MOVOU X14, -32(BX)(SI*1) - MOVOU X15, -16(BX)(SI*1) + MOVOU X12, -64(BX)(DI*1) + MOVOU X13, -48(BX)(DI*1) + MOVOU X14, -32(BX)(DI*1) + MOVOU X15, -16(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm14BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_129through256: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU 32(DI), X2 - MOVOU 48(DI), X3 - MOVOU 64(DI), X4 - MOVOU 80(DI), X5 - MOVOU 96(DI), X6 - MOVOU 112(DI), X7 - MOVOU -128(DI)(SI*1), X8 - MOVOU -112(DI)(SI*1), X9 - MOVOU -96(DI)(SI*1), X10 - MOVOU -80(DI)(SI*1), X11 - MOVOU -64(DI)(SI*1), X12 - MOVOU -48(DI)(SI*1), X13 - MOVOU -32(DI)(SI*1), X14 - MOVOU -16(DI)(SI*1), X15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU -128(R8)(DI*1), X8 + MOVOU -112(R8)(DI*1), X9 + MOVOU -96(R8)(DI*1), X10 + MOVOU -80(R8)(DI*1), X11 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (BX) MOVOU X1, 16(BX) MOVOU X2, 32(BX) @@ -6103,34 +6103,34 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_129through256: MOVOU X5, 80(BX) MOVOU X6, 96(BX) MOVOU X7, 112(BX) - MOVOU X8, -128(BX)(SI*1) - MOVOU X9, -112(BX)(SI*1) - MOVOU X10, -96(BX)(SI*1) - MOVOU X11, -80(BX)(SI*1) - MOVOU X12, -64(BX)(SI*1) - MOVOU X13, -48(BX)(SI*1) - MOVOU X14, -32(BX)(SI*1) - MOVOU X15, -16(BX)(SI*1) + MOVOU X8, -128(BX)(DI*1) + MOVOU X9, -112(BX)(DI*1) + MOVOU X10, -96(BX)(DI*1) + MOVOU X11, -80(BX)(DI*1) + MOVOU X12, -64(BX)(DI*1) + MOVOU X13, -48(BX)(DI*1) + MOVOU X14, -32(BX)(DI*1) + MOVOU X15, -16(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm14BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_256through2048: - LEAQ -256(SI), SI - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU 32(DI), X2 - MOVOU 48(DI), X3 - MOVOU 64(DI), X4 - MOVOU 80(DI), X5 - MOVOU 96(DI), X6 - MOVOU 112(DI), X7 - MOVOU 128(DI), X8 - MOVOU 144(DI), X9 - MOVOU 160(DI), X10 - MOVOU 176(DI), X11 - MOVOU 192(DI), X12 - MOVOU 208(DI), X13 - MOVOU 224(DI), X14 - MOVOU 240(DI), X15 + LEAQ -256(DI), DI + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU 128(R8), X8 + MOVOU 144(R8), X9 + MOVOU 160(R8), X10 + MOVOU 176(R8), X11 + MOVOU 192(R8), X12 + MOVOU 208(R8), X13 + MOVOU 224(R8), X14 + MOVOU 240(R8), X15 MOVOU X0, (BX) MOVOU X1, 16(BX) MOVOU X2, 32(BX) @@ -6147,60 +6147,60 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_256through2048: MOVOU X13, 208(BX) MOVOU X14, 224(BX) MOVOU X15, 240(BX) - CMPQ SI, $0x00000100 - LEAQ 256(DI), DI + CMPQ DI, $0x00000100 + LEAQ 256(R8), R8 LEAQ 256(BX), BX JGE emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_move_256through2048 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_tail emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_avxUnaligned: - LEAQ (DI)(SI*1), R9 - MOVQ BX, R11 - MOVOU -128(R9), X5 - MOVOU -112(R9), X6 - MOVQ $0x00000080, R8 + LEAQ (R8)(DI*1), R10 + MOVQ BX, R12 + MOVOU -128(R10), X5 + MOVOU -112(R10), X6 + MOVQ $0x00000080, R9 ANDQ $0xffffffe0, BX ADDQ $0x20, BX - MOVOU -96(R9), X7 - MOVOU -80(R9), X8 - MOVQ BX, R10 - SUBQ R11, R10 - MOVOU -64(R9), X9 - MOVOU -48(R9), X10 - SUBQ R10, SI - MOVOU -32(R9), X11 - MOVOU -16(R9), X12 - VMOVDQU (DI), Y4 - ADDQ R10, DI - SUBQ R8, SI + MOVOU -96(R10), X7 + MOVOU -80(R10), X8 + MOVQ BX, R11 + SUBQ R12, R11 + MOVOU -64(R10), X9 + MOVOU -48(R10), X10 + SUBQ R11, DI + MOVOU -32(R10), X11 + MOVOU -16(R10), X12 + VMOVDQU (R8), Y4 + ADDQ R11, R8 + SUBQ R9, DI emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_gobble_128_loop: - VMOVDQU (DI), Y0 - VMOVDQU 32(DI), Y1 - VMOVDQU 64(DI), Y2 - VMOVDQU 96(DI), Y3 - ADDQ R8, DI + VMOVDQU (R8), Y0 + VMOVDQU 32(R8), Y1 + VMOVDQU 64(R8), Y2 + VMOVDQU 96(R8), Y3 + ADDQ R9, R8 VMOVDQA Y0, (BX) VMOVDQA Y1, 32(BX) VMOVDQA Y2, 64(BX) VMOVDQA Y3, 96(BX) - ADDQ R8, BX - SUBQ R8, SI + ADDQ R9, BX + SUBQ R9, DI JA emit_lit_memmove_repeat_emit_encodeBlockAsm14BAvx_memmove_gobble_128_loop - ADDQ R8, SI - ADDQ BX, SI - VMOVDQU Y4, (R11) + ADDQ R9, DI + ADDQ BX, DI + VMOVDQU Y4, (R12) VZEROUPPER - MOVOU X5, -128(SI) - MOVOU X6, -112(SI) - MOVOU X7, -96(SI) - MOVOU X8, -80(SI) - MOVOU X9, -64(SI) - MOVOU X10, -48(SI) - MOVOU X11, -32(SI) - MOVOU X12, -16(SI) + MOVOU X5, -128(DI) + MOVOU X6, -112(DI) + MOVOU X7, -96(DI) + MOVOU X8, -80(DI) + MOVOU X9, -64(DI) + MOVOU X10, -48(DI) + MOVOU X11, -32(DI) + MOVOU X12, -16(DI) JMP emit_literal_done_repeat_emit_encodeBlockAsm14BAvx - MOVQ R8, BX + MOVQ R9, BX emit_literal_done_repeat_emit_encodeBlockAsm14BAvx: MOVQ BX, dst_base+0(FP) @@ -6211,23 +6211,23 @@ emit_literal_skip_repeat_emit_encodeBlockAsm14BAvx: SUBL 24(SP), BX MOVL 16(SP), BX SUBL AX, BX - XORQ DI, DI + XORQ R8, R8 CMPQ BX, $0x08 JL matchlen_single_repeat_extend matchlen_loopback_repeat_extend: - MOVQ (CX)(DI*1), SI - XORQ (CX)(DI*1), SI - TESTQ SI, SI + MOVQ (CX)(R8*1), DI + XORQ (CX)(R8*1), DI + TESTQ DI, DI JZ matchlen_loop_repeat_extend - BSFQ SI, SI - SARQ $0x03, SI - LEAQ (DI)(SI*1), DI + BSFQ DI, DI + SARQ $0x03, DI + LEAQ (R8)(DI*1), R8 JMP repeat_extend_forward_end_encodeBlockAsm14BAvx matchlen_loop_repeat_extend: LEAQ -8(BX), BX - LEAQ 8(DI), DI + LEAQ 8(R8), R8 CMPQ BX, $0x08 JGE matchlen_loopback_repeat_extend @@ -6236,31 +6236,31 @@ matchlen_single_repeat_extend: JZ repeat_extend_forward_end_encodeBlockAsm14BAvx matchlen_single_loopback_repeat_extend: - MOVB (CX)(DI*1), SI - CMPB (CX)(DI*1), SI + MOVB (CX)(R8*1), DI + CMPB (CX)(R8*1), DI JNE repeat_extend_forward_end_encodeBlockAsm14BAvx - LEAQ 1(DI), DI + LEAQ 1(R8), R8 DECQ BX JNZ matchlen_single_loopback_repeat_extend repeat_extend_forward_end_encodeBlockAsm14BAvx: - ADDL DI, AX + ADDL R8, AX MOVL AX, BX - SUBL BP, BX - MOVL 24(SP), BP - MOVQ dst_base+0(FP), SI - MOVL 20(SP), DI - TESTL DI, DI + SUBL SI, BX + MOVL 24(SP), SI + MOVQ dst_base+0(FP), DI + MOVL 20(SP), R8 + TESTL R8, R8 JZ repeat_as_copy_encodeBlockAsm14BAvx emit_repeat_again_match_repeat_: - MOVQ BX, DI + MOVQ BX, R8 LEAQ -4(BX), BX - CMPL DI, $0x08 + CMPL R8, $0x08 JLE repeat_two_match_repeat_ - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_match_repeat_ - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_repeat_ cant_repeat_two_offset_match_repeat_: @@ -6271,74 +6271,74 @@ cant_repeat_two_offset_match_repeat_: CMPL BX, $0x0100ffff JLT repeat_five_match_repeat_ LEAQ -16842747(BX), BX - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_match_repeat_ repeat_five_match_repeat_: LEAQ -65536(BX), BX - MOVQ BX, BP - MOVW $0x001d, (SI) - MOVW BX, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + MOVQ BX, SI + MOVW $0x001d, (DI) + MOVW BX, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP repeat_end_emit_encodeBlockAsm14BAvx repeat_four_match_repeat_: LEAQ -256(BX), BX - MOVW $0x0019, (SI) - MOVW BX, 2(SI) - ADDQ $0x04, SI + MOVW $0x0019, (DI) + MOVW BX, 2(DI) + ADDQ $0x04, DI JMP repeat_end_emit_encodeBlockAsm14BAvx repeat_three_match_repeat_: LEAQ -4(BX), BX - MOVW $0x0015, (SI) - MOVB BL, 2(SI) - ADDQ $0x03, SI + MOVW $0x0015, (DI) + MOVB BL, 2(DI) + ADDQ $0x03, DI JMP repeat_end_emit_encodeBlockAsm14BAvx repeat_two_match_repeat_: SHLL $0x02, BX ORL $0x01, BX - MOVW BX, (SI) - ADDQ $0x02, SI + MOVW BX, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm14BAvx repeat_two_offset_match_repeat_: - XORQ DI, DI - LEAQ 1(DI)(BX*4), BX - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (SI) - ADDQ $0x02, SI + XORQ R8, R8 + LEAQ 1(R8)(BX*4), BX + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm14BAvx repeat_as_copy_encodeBlockAsm14BAvx: - CMPL BP, $0x00010000 + CMPL SI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeBlockAsm14BAvx CMPL BX, $0x40 JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm14BAvx - MOVB $0xff, (SI) - MOVD BP, 1(SI) + MOVB $0xff, (DI) + MOVD SI, 1(DI) LEAQ -64(BX), BX - ADDQ $0x05, SI + ADDQ $0x05, DI CMPL BX, $0x04 JL four_bytes_remain_repeat_as_copy_encodeBlockAsm14BAvx emit_repeat_again_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy: - MOVQ BX, DI + MOVQ BX, R8 LEAQ -4(BX), BX - CMPL DI, $0x08 + CMPL R8, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy: @@ -6349,52 +6349,52 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy: CMPL BX, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy LEAQ -16842747(BX), BX - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy repeat_five_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy: LEAQ -65536(BX), BX - MOVQ BX, BP - MOVW $0x001d, (SI) - MOVW BX, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + MOVQ BX, SI + MOVW $0x001d, (DI) + MOVW BX, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP repeat_end_emit_encodeBlockAsm14BAvx repeat_four_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy: LEAQ -256(BX), BX - MOVW $0x0019, (SI) - MOVW BX, 2(SI) - ADDQ $0x04, SI + MOVW $0x0019, (DI) + MOVW BX, 2(DI) + ADDQ $0x04, DI JMP repeat_end_emit_encodeBlockAsm14BAvx repeat_three_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy: LEAQ -4(BX), BX - MOVW $0x0015, (SI) - MOVB BL, 2(SI) - ADDQ $0x03, SI + MOVW $0x0015, (DI) + MOVB BL, 2(DI) + ADDQ $0x03, DI JMP repeat_end_emit_encodeBlockAsm14BAvx repeat_two_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy: SHLL $0x02, BX ORL $0x01, BX - MOVW BX, (SI) - ADDQ $0x02, SI + MOVW BX, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm14BAvx repeat_two_offset_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy: - XORQ DI, DI - LEAQ 1(DI)(BX*4), BX - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (SI) - ADDQ $0x02, SI + XORQ R8, R8 + LEAQ 1(R8)(BX*4), BX + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm14BAvx four_bytes_remain_repeat_as_copy_encodeBlockAsm14BAvx: @@ -6402,27 +6402,27 @@ four_bytes_remain_repeat_as_copy_encodeBlockAsm14BAvx: JZ repeat_end_emit_encodeBlockAsm14BAvx MOVB $0x03, DL LEAQ -4(DX)(BX*4), BX - MOVB BL, (SI) - MOVD BP, 1(SI) - ADDQ $0x05, SI + MOVB BL, (DI) + MOVD SI, 1(DI) + ADDQ $0x05, DI JMP repeat_end_emit_encodeBlockAsm14BAvx two_byte_offset_repeat_as_copy_encodeBlockAsm14BAvx: CMPL BX, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm14BAvx - MOVB $0xee, (SI) - MOVW BP, 1(SI) + MOVB $0xee, (DI) + MOVW SI, 1(DI) LEAQ -60(BX), BX - ADDQ $0x03, SI + ADDQ $0x03, DI emit_repeat_again_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy_short: - MOVQ BX, DI + MOVQ BX, R8 LEAQ -4(BX), BX - CMPL DI, $0x08 + CMPL R8, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy_short - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy_short - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy_short cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy_short: @@ -6433,100 +6433,100 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy_short: CMPL BX, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy_short LEAQ -16842747(BX), BX - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy_short repeat_five_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy_short: LEAQ -65536(BX), BX - MOVQ BX, BP - MOVW $0x001d, (SI) - MOVW BX, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + MOVQ BX, SI + MOVW $0x001d, (DI) + MOVW BX, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP repeat_end_emit_encodeBlockAsm14BAvx repeat_four_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy_short: LEAQ -256(BX), BX - MOVW $0x0019, (SI) - MOVW BX, 2(SI) - ADDQ $0x04, SI + MOVW $0x0019, (DI) + MOVW BX, 2(DI) + ADDQ $0x04, DI JMP repeat_end_emit_encodeBlockAsm14BAvx repeat_three_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy_short: LEAQ -4(BX), BX - MOVW $0x0015, (SI) - MOVB BL, 2(SI) - ADDQ $0x03, SI + MOVW $0x0015, (DI) + MOVB BL, 2(DI) + ADDQ $0x03, DI JMP repeat_end_emit_encodeBlockAsm14BAvx repeat_two_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy_short: SHLL $0x02, BX ORL $0x01, BX - MOVW BX, (SI) - ADDQ $0x02, SI + MOVW BX, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm14BAvx repeat_two_offset_repeat_as_copy_encodeBlockAsm14BAvx_emit_copy_short: - XORQ DI, DI - LEAQ 1(DI)(BX*4), BX - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (SI) - ADDQ $0x02, SI + XORQ R8, R8 + LEAQ 1(R8)(BX*4), BX + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm14BAvx two_byte_offset_short_repeat_as_copy_encodeBlockAsm14BAvx: CMPL BX, $0x0c JGE emit_copy_three_repeat_as_copy_encodeBlockAsm14BAvx - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeBlockAsm14BAvx MOVB $0x01, DL LEAQ -16(DX)(BX*4), BX - MOVB BP, 1(SI) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (SI) - ADDQ $0x02, SI + MOVB SI, 1(DI) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm14BAvx emit_copy_three_repeat_as_copy_encodeBlockAsm14BAvx: MOVB $0x02, DL LEAQ -4(DX)(BX*4), BX - MOVB BL, (SI) - MOVW BP, 1(SI) - ADDQ $0x03, SI + MOVB BL, (DI) + MOVW SI, 1(DI) + ADDQ $0x03, DI repeat_end_emit_encodeBlockAsm14BAvx: - MOVQ SI, dst_base+0(FP) + MOVQ DI, dst_base+0(FP) MOVL 16(SP), BX CMPL AX, BX JGT emit_remainder_encodeBlockAsm14BAvx JMP search_loop_encodeBlockAsm14BAvx no_repeat_found_encodeBlockAsm14BAvx: - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ BP, DI - SHRQ $0x10, DI - SHLQ $0x10, DI - IMULQ R8, DI - SHRQ $0x32, DI - CMPL (CX)(BX*1), BP - SHRQ $0x08, BP + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ SI, R8 + SHRQ $0x10, R8 + SHLQ $0x10, R8 + IMULQ R9, R8 + SHRQ $0x32, R8 + CMPL (CX)(BX*1), SI + SHRQ $0x08, SI JEQ candidate_match_encodeBlockAsm14BAvx - MOVL 32(SP)(DI*1), BX - CMPL (CX)(SI*1), BP + MOVL 32(SP)(R8*1), BX + CMPL (CX)(DI*1), SI JEQ candidate2_match_encodeBlockAsm14BAvx - LEAQ 2(AX), SI - MOVL SI, 32(SP)(DI*1) - SHRQ $0x08, BP - CMPL (CX)(BX*1), BP + LEAQ 2(AX), DI + MOVL DI, 32(SP)(R8*1) + SHRQ $0x08, SI + CMPL (CX)(BX*1), SI JEQ candidate3_match_encodeBlockAsm14BAvx MOVL 28(SP), AX JMP search_loop_encodeBlockAsm14BAvx @@ -6537,21 +6537,21 @@ candidate3_match_encodeBlockAsm14BAvx: candidate2_match_encodeBlockAsm14BAvx: LEAQ -2(AX), BX - MOVL BX, 32(SP)(DI*1) + MOVL BX, 32(SP)(R8*1) INCL AX - MOVL SI, BX + MOVL DI, BX candidate_match_encodeBlockAsm14BAvx: - MOVL 20(SP), BP + MOVL 20(SP), SI TESTL BX, BX JZ match_extend_back_end_encodeBlockAsm14BAvx match_extend_back_loop_encodeBlockAsm14BAvx: - CMPL AX, BP + CMPL AX, SI JG match_extend_back_end_encodeBlockAsm14BAvx MOVB -1(CX)(BX*1), DL - MOVB -1(CX)(AX*1), SI - CMPB DL, SI + MOVB -1(CX)(AX*1), DI + CMPB DL, DI JNE match_extend_back_end_encodeBlockAsm14BAvx LEAL -1(AX), AX DECL BX @@ -6559,555 +6559,555 @@ match_extend_back_loop_encodeBlockAsm14BAvx: JMP match_extend_back_loop_encodeBlockAsm14BAvx match_extend_back_end_encodeBlockAsm14BAvx: - MOVL AX, BP - SUBL 20(SP), BP - LEAQ dst_base+0(FP)(BP*1), BP - CMPQ BP, (SP) + MOVL AX, SI + SUBL 20(SP), SI + LEAQ dst_base+0(FP)(SI*1), SI + CMPQ SI, (SP) JL match_dst_size_check_encodeBlockAsm14BAvx MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBlockAsm14BAvx: - MOVL BX, BP - MOVL 20(SP), SI - CMPL SI, BP + MOVL BX, SI + MOVL 20(SP), DI + CMPL DI, SI JEQ emit_literal_skip_match_emit_encodeBlockAsm14BAvx - MOVL BP, DI - MOVL BP, 20(SP) - LEAQ (CX)(SI*1), BP - SUBL SI, DI - MOVQ dst_base+0(FP), SI - MOVQ DI, R8 - SUBL $0x01, R8 + MOVL SI, R8 + MOVL SI, 20(SP) + LEAQ (CX)(DI*1), SI + SUBL DI, R8 + MOVQ dst_base+0(FP), DI + MOVQ R8, R9 + SUBL $0x01, R9 JC emit_literal_done_match_emit_encodeBlockAsm14BAvx - CMPL R8, $0x3c + CMPL R9, $0x3c JLT one_byte_match_emit_encodeBlockAsm14BAvx - CMPL R8, $0x00000100 + CMPL R9, $0x00000100 JLT two_bytes_match_emit_encodeBlockAsm14BAvx - CMPL R8, $0x00010000 + CMPL R9, $0x00010000 JLT three_bytes_match_emit_encodeBlockAsm14BAvx - CMPL R8, $0x01000000 + CMPL R9, $0x01000000 JLT four_bytes_match_emit_encodeBlockAsm14BAvx - MOVB $0xfc, (SI) - MOVL R8, 1(SI) - ADDQ $0x05, SI + MOVB $0xfc, (DI) + MOVL R9, 1(DI) + ADDQ $0x05, DI JMP memmove_match_emit_encodeBlockAsm14BAvx four_bytes_match_emit_encodeBlockAsm14BAvx: - MOVQ R8, R9 - SHRL $0x10, R9 - MOVB $0xf8, (SI) - MOVW R8, 1(SI) - MOVB R9, 3(SI) - ADDQ $0x04, SI + MOVQ R9, R10 + SHRL $0x10, R10 + MOVB $0xf8, (DI) + MOVW R9, 1(DI) + MOVB R10, 3(DI) + ADDQ $0x04, DI JMP memmove_match_emit_encodeBlockAsm14BAvx three_bytes_match_emit_encodeBlockAsm14BAvx: - MOVB $0xf4, (SI) - MOVW R8, 1(SI) - ADDQ $0x03, SI + MOVB $0xf4, (DI) + MOVW R9, 1(DI) + ADDQ $0x03, DI JMP memmove_match_emit_encodeBlockAsm14BAvx two_bytes_match_emit_encodeBlockAsm14BAvx: - MOVB $0xf0, (SI) - MOVB R8, 1(SI) - ADDQ $0x02, SI + MOVB $0xf0, (DI) + MOVB R9, 1(DI) + ADDQ $0x02, DI JMP memmove_match_emit_encodeBlockAsm14BAvx one_byte_match_emit_encodeBlockAsm14BAvx: - SHLB $0x02, R8 - MOVB R8, (SI) - ADDQ $0x01, SI + SHLB $0x02, R9 + MOVB R9, (DI) + ADDQ $0x01, DI memmove_match_emit_encodeBlockAsm14BAvx: - LEAQ (SI)(DI*1), R8 + LEAQ (DI)(R8*1), R9 NOP emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_tail: - TESTQ DI, DI + TESTQ R8, R8 JEQ emit_literal_done_match_emit_encodeBlockAsm14BAvx - CMPQ DI, $0x02 + CMPQ R8, $0x02 JBE emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_1or2 - CMPQ DI, $0x04 + CMPQ R8, $0x04 JB emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_3 JBE emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_4 - CMPQ DI, $0x08 + CMPQ R8, $0x08 JB emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_5through7 JE emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_8 - CMPQ DI, $0x10 + CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_9through16 - CMPQ DI, $0x20 + CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_17through32 - CMPQ DI, $0x40 + CMPQ R8, $0x40 JBE emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_33through64 - CMPQ DI, $0x80 + CMPQ R8, $0x80 JBE emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_65through128 - CMPQ DI, $0x00000100 + CMPQ R8, $0x00000100 JBE emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_129through256 JMP emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_avxUnaligned emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_1or2: - MOVB (BP), R8 - MOVB -1(BP)(DI*1), R9 - MOVB R8, (SI) - MOVB R9, -1(SI)(DI*1) + MOVB (SI), R9 + MOVB -1(SI)(R8*1), R10 + MOVB R9, (DI) + MOVB R10, -1(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm14BAvx emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_4: - MOVL (BP), R8 - MOVL R8, (SI) + MOVL (SI), R9 + MOVL R9, (DI) JMP emit_literal_done_match_emit_encodeBlockAsm14BAvx emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_3: - MOVW (BP), R8 - MOVB 2(BP), R9 - MOVW R8, (SI) - MOVB R9, 2(SI) + MOVW (SI), R9 + MOVB 2(SI), R10 + MOVW R9, (DI) + MOVB R10, 2(DI) JMP emit_literal_done_match_emit_encodeBlockAsm14BAvx emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_5through7: - MOVL (BP), R8 - MOVL -4(BP)(DI*1), R9 - MOVL R8, (SI) - MOVL R9, -4(SI)(DI*1) + MOVL (SI), R9 + MOVL -4(SI)(R8*1), R10 + MOVL R9, (DI) + MOVL R10, -4(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm14BAvx emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_8: - MOVQ (BP), R8 - MOVQ R8, (SI) + MOVQ (SI), R9 + MOVQ R9, (DI) JMP emit_literal_done_match_emit_encodeBlockAsm14BAvx emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_9through16: - MOVQ (BP), R8 - MOVQ -8(BP)(DI*1), R9 - MOVQ R8, (SI) - MOVQ R9, -8(SI)(DI*1) + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), R10 + MOVQ R9, (DI) + MOVQ R10, -8(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm14BAvx emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_17through32: - MOVOU (BP), X0 - MOVOU -16(BP)(DI*1), X1 - MOVOU X0, (SI) - MOVOU X1, -16(SI)(DI*1) + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 + MOVOU X0, (DI) + MOVOU X1, -16(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm14BAvx emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_33through64: - MOVOU (BP), X0 - MOVOU 16(BP), X1 - MOVOU -32(BP)(DI*1), X2 - MOVOU -16(BP)(DI*1), X3 - MOVOU X0, (SI) - MOVOU X1, 16(SI) - MOVOU X2, -32(SI)(DI*1) - MOVOU X3, -16(SI)(DI*1) + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, -32(DI)(R8*1) + MOVOU X3, -16(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm14BAvx emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_65through128: - MOVOU (BP), X0 - MOVOU 16(BP), X1 - MOVOU 32(BP), X2 - MOVOU 48(BP), X3 - MOVOU -64(BP)(DI*1), X12 - MOVOU -48(BP)(DI*1), X13 - MOVOU -32(BP)(DI*1), X14 - MOVOU -16(BP)(DI*1), X15 - MOVOU X0, (SI) - MOVOU X1, 16(SI) - MOVOU X2, 32(SI) - MOVOU X3, 48(SI) - MOVOU X12, -64(SI)(DI*1) - MOVOU X13, -48(SI)(DI*1) - MOVOU X14, -32(SI)(DI*1) - MOVOU X15, -16(SI)(DI*1) + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) + MOVOU X3, 48(DI) + MOVOU X12, -64(DI)(R8*1) + MOVOU X13, -48(DI)(R8*1) + MOVOU X14, -32(DI)(R8*1) + MOVOU X15, -16(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm14BAvx emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_129through256: - MOVOU (BP), X0 - MOVOU 16(BP), X1 - MOVOU 32(BP), X2 - MOVOU 48(BP), X3 - MOVOU 64(BP), X4 - MOVOU 80(BP), X5 - MOVOU 96(BP), X6 - MOVOU 112(BP), X7 - MOVOU -128(BP)(DI*1), X8 - MOVOU -112(BP)(DI*1), X9 - MOVOU -96(BP)(DI*1), X10 - MOVOU -80(BP)(DI*1), X11 - MOVOU -64(BP)(DI*1), X12 - MOVOU -48(BP)(DI*1), X13 - MOVOU -32(BP)(DI*1), X14 - MOVOU -16(BP)(DI*1), X15 - MOVOU X0, (SI) - MOVOU X1, 16(SI) - MOVOU X2, 32(SI) - MOVOU X3, 48(SI) - MOVOU X4, 64(SI) - MOVOU X5, 80(SI) - MOVOU X6, 96(SI) - MOVOU X7, 112(SI) - MOVOU X8, -128(SI)(DI*1) - MOVOU X9, -112(SI)(DI*1) - MOVOU X10, -96(SI)(DI*1) - MOVOU X11, -80(SI)(DI*1) - MOVOU X12, -64(SI)(DI*1) - MOVOU X13, -48(SI)(DI*1) - MOVOU X14, -32(SI)(DI*1) - MOVOU X15, -16(SI)(DI*1) + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU -128(SI)(R8*1), X8 + MOVOU -112(SI)(R8*1), X9 + MOVOU -96(SI)(R8*1), X10 + MOVOU -80(SI)(R8*1), X11 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) + MOVOU X3, 48(DI) + MOVOU X4, 64(DI) + MOVOU X5, 80(DI) + MOVOU X6, 96(DI) + MOVOU X7, 112(DI) + MOVOU X8, -128(DI)(R8*1) + MOVOU X9, -112(DI)(R8*1) + MOVOU X10, -96(DI)(R8*1) + MOVOU X11, -80(DI)(R8*1) + MOVOU X12, -64(DI)(R8*1) + MOVOU X13, -48(DI)(R8*1) + MOVOU X14, -32(DI)(R8*1) + MOVOU X15, -16(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm14BAvx emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_256through2048: - LEAQ -256(DI), DI - MOVOU (BP), X0 - MOVOU 16(BP), X1 - MOVOU 32(BP), X2 - MOVOU 48(BP), X3 - MOVOU 64(BP), X4 - MOVOU 80(BP), X5 - MOVOU 96(BP), X6 - MOVOU 112(BP), X7 - MOVOU 128(BP), X8 - MOVOU 144(BP), X9 - MOVOU 160(BP), X10 - MOVOU 176(BP), X11 - MOVOU 192(BP), X12 - MOVOU 208(BP), X13 - MOVOU 224(BP), X14 - MOVOU 240(BP), X15 - MOVOU X0, (SI) - MOVOU X1, 16(SI) - MOVOU X2, 32(SI) - MOVOU X3, 48(SI) - MOVOU X4, 64(SI) - MOVOU X5, 80(SI) - MOVOU X6, 96(SI) - MOVOU X7, 112(SI) - MOVOU X8, 128(SI) - MOVOU X9, 144(SI) - MOVOU X10, 160(SI) - MOVOU X11, 176(SI) - MOVOU X12, 192(SI) - MOVOU X13, 208(SI) - MOVOU X14, 224(SI) - MOVOU X15, 240(SI) - CMPQ DI, $0x00000100 - LEAQ 256(BP), BP + LEAQ -256(R8), R8 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU 128(SI), X8 + MOVOU 144(SI), X9 + MOVOU 160(SI), X10 + MOVOU 176(SI), X11 + MOVOU 192(SI), X12 + MOVOU 208(SI), X13 + MOVOU 224(SI), X14 + MOVOU 240(SI), X15 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) + MOVOU X3, 48(DI) + MOVOU X4, 64(DI) + MOVOU X5, 80(DI) + MOVOU X6, 96(DI) + MOVOU X7, 112(DI) + MOVOU X8, 128(DI) + MOVOU X9, 144(DI) + MOVOU X10, 160(DI) + MOVOU X11, 176(DI) + MOVOU X12, 192(DI) + MOVOU X13, 208(DI) + MOVOU X14, 224(DI) + MOVOU X15, 240(DI) + CMPQ R8, $0x00000100 LEAQ 256(SI), SI + LEAQ 256(DI), DI JGE emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_move_256through2048 JMP emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_tail emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_avxUnaligned: - LEAQ (BP)(DI*1), R9 - MOVQ SI, R11 - MOVOU -128(R9), X5 - MOVOU -112(R9), X6 - MOVQ $0x00000080, R8 - ANDQ $0xffffffe0, SI - ADDQ $0x20, SI - MOVOU -96(R9), X7 - MOVOU -80(R9), X8 - MOVQ SI, R10 - SUBQ R11, R10 - MOVOU -64(R9), X9 - MOVOU -48(R9), X10 - SUBQ R10, DI - MOVOU -32(R9), X11 - MOVOU -16(R9), X12 - VMOVDQU (BP), Y4 - ADDQ R10, BP - SUBQ R8, DI + LEAQ (SI)(R8*1), R10 + MOVQ DI, R12 + MOVOU -128(R10), X5 + MOVOU -112(R10), X6 + MOVQ $0x00000080, R9 + ANDQ $0xffffffe0, DI + ADDQ $0x20, DI + MOVOU -96(R10), X7 + MOVOU -80(R10), X8 + MOVQ DI, R11 + SUBQ R12, R11 + MOVOU -64(R10), X9 + MOVOU -48(R10), X10 + SUBQ R11, R8 + MOVOU -32(R10), X11 + MOVOU -16(R10), X12 + VMOVDQU (SI), Y4 + ADDQ R11, SI + SUBQ R9, R8 emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_gobble_128_loop: - VMOVDQU (BP), Y0 - VMOVDQU 32(BP), Y1 - VMOVDQU 64(BP), Y2 - VMOVDQU 96(BP), Y3 - ADDQ R8, BP - VMOVDQA Y0, (SI) - VMOVDQA Y1, 32(SI) - VMOVDQA Y2, 64(SI) - VMOVDQA Y3, 96(SI) - ADDQ R8, SI - SUBQ R8, DI + VMOVDQU (SI), Y0 + VMOVDQU 32(SI), Y1 + VMOVDQU 64(SI), Y2 + VMOVDQU 96(SI), Y3 + ADDQ R9, SI + VMOVDQA Y0, (DI) + VMOVDQA Y1, 32(DI) + VMOVDQA Y2, 64(DI) + VMOVDQA Y3, 96(DI) + ADDQ R9, DI + SUBQ R9, R8 JA emit_lit_memmove_match_emit_encodeBlockAsm14BAvx_memmove_gobble_128_loop - ADDQ R8, DI - ADDQ SI, DI - VMOVDQU Y4, (R11) + ADDQ R9, R8 + ADDQ DI, R8 + VMOVDQU Y4, (R12) VZEROUPPER - MOVOU X5, -128(DI) - MOVOU X6, -112(DI) - MOVOU X7, -96(DI) - MOVOU X8, -80(DI) - MOVOU X9, -64(DI) - MOVOU X10, -48(DI) - MOVOU X11, -32(DI) - MOVOU X12, -16(DI) + MOVOU X5, -128(R8) + MOVOU X6, -112(R8) + MOVOU X7, -96(R8) + MOVOU X8, -80(R8) + MOVOU X9, -64(R8) + MOVOU X10, -48(R8) + MOVOU X11, -32(R8) + MOVOU X12, -16(R8) JMP emit_literal_done_match_emit_encodeBlockAsm14BAvx - MOVQ R8, SI + MOVQ R9, DI emit_literal_done_match_emit_encodeBlockAsm14BAvx: - MOVQ SI, dst_base+0(FP) + MOVQ DI, dst_base+0(FP) emit_literal_skip_match_emit_encodeBlockAsm14BAvx: NOP match_nolit_loop_encodeBlockAsm14BAvx: - MOVL AX, BP - MOVL AX, BP - SUBL BX, BP - MOVL BP, 24(SP) + MOVL AX, SI + MOVL AX, SI + SUBL BX, SI + MOVL SI, 24(SP) ADDL $0x04, AX ADDL $0x04, BX - MOVL 16(SP), BP - SUBL AX, BP - XORQ DI, DI - CMPQ BP, $0x08 + MOVL 16(SP), SI + SUBL AX, SI + XORQ R8, R8 + CMPQ SI, $0x08 JL matchlen_single_match_nolit_encodeBlockAsm14BAvx matchlen_loopback_match_nolit_encodeBlockAsm14BAvx: - MOVQ (CX)(DI*1), SI - XORQ (CX)(DI*1), SI - TESTQ SI, SI + MOVQ (CX)(R8*1), DI + XORQ (CX)(R8*1), DI + TESTQ DI, DI JZ matchlen_loop_match_nolit_encodeBlockAsm14BAvx - BSFQ SI, SI - SARQ $0x03, SI - LEAQ (DI)(SI*1), DI + BSFQ DI, DI + SARQ $0x03, DI + LEAQ (R8)(DI*1), R8 JMP match_nolit_end_encodeBlockAsm14BAvx matchlen_loop_match_nolit_encodeBlockAsm14BAvx: - LEAQ -8(BP), BP - LEAQ 8(DI), DI - CMPQ BP, $0x08 + LEAQ -8(SI), SI + LEAQ 8(R8), R8 + CMPQ SI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm14BAvx matchlen_single_match_nolit_encodeBlockAsm14BAvx: - TESTQ BP, BP + TESTQ SI, SI JZ match_nolit_end_encodeBlockAsm14BAvx matchlen_single_loopback_match_nolit_encodeBlockAsm14BAvx: - MOVB (CX)(DI*1), SI - CMPB (CX)(DI*1), SI + MOVB (CX)(R8*1), DI + CMPB (CX)(R8*1), DI JNE match_nolit_end_encodeBlockAsm14BAvx - LEAQ 1(DI), DI - DECQ BP + LEAQ 1(R8), R8 + DECQ SI JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm14BAvx match_nolit_end_encodeBlockAsm14BAvx: - MOVL 24(SP), BP - ADDQ $0x04, DI - MOVQ dst_base+0(FP), SI - ADDL DI, AX - CMPL BP, $0x00010000 + MOVL 24(SP), SI + ADDQ $0x04, R8 + MOVQ dst_base+0(FP), DI + ADDL R8, AX + CMPL SI, $0x00010000 JL two_byte_offset_match_nolit_encodeBlockAsm14BAvx - CMPL DI, $0x40 + CMPL R8, $0x40 JLE four_bytes_remain_match_nolit_encodeBlockAsm14BAvx - MOVB $0xff, (SI) - MOVD BP, 1(SI) - LEAQ -64(DI), DI - ADDQ $0x05, SI - CMPL DI, $0x04 + MOVB $0xff, (DI) + MOVD SI, 1(DI) + LEAQ -64(R8), R8 + ADDQ $0x05, DI + CMPL R8, $0x04 JL four_bytes_remain_match_nolit_encodeBlockAsm14BAvx emit_repeat_again_match_nolit_encodeBlockAsm14BAvx_emit_copy: - MOVQ DI, R8 - LEAQ -4(DI), DI - CMPL R8, $0x08 + MOVQ R8, R9 + LEAQ -4(R8), R8 + CMPL R9, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm14BAvx_emit_copy - CMPL R8, $0x0c + CMPL R9, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm14BAvx_emit_copy - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm14BAvx_emit_copy cant_repeat_two_offset_match_nolit_encodeBlockAsm14BAvx_emit_copy: - CMPL DI, $0x00000104 + CMPL R8, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm14BAvx_emit_copy - CMPL DI, $0x00010100 + CMPL R8, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm14BAvx_emit_copy - CMPL DI, $0x0100ffff + CMPL R8, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsm14BAvx_emit_copy - LEAQ -16842747(DI), DI - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + LEAQ -16842747(R8), R8 + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_match_nolit_encodeBlockAsm14BAvx_emit_copy repeat_five_match_nolit_encodeBlockAsm14BAvx_emit_copy: - LEAQ -65536(DI), DI - MOVQ DI, BP - MOVW $0x001d, (SI) - MOVW DI, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + LEAQ -65536(R8), R8 + MOVQ R8, SI + MOVW $0x001d, (DI) + MOVW R8, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP match_nolit_emitcopy_end_encodeBlockAsm14BAvx repeat_four_match_nolit_encodeBlockAsm14BAvx_emit_copy: - LEAQ -256(DI), DI - MOVW $0x0019, (SI) - MOVW DI, 2(SI) - ADDQ $0x04, SI + LEAQ -256(R8), R8 + MOVW $0x0019, (DI) + MOVW R8, 2(DI) + ADDQ $0x04, DI JMP match_nolit_emitcopy_end_encodeBlockAsm14BAvx repeat_three_match_nolit_encodeBlockAsm14BAvx_emit_copy: - LEAQ -4(DI), DI - MOVW $0x0015, (SI) - MOVB DI, 2(SI) - ADDQ $0x03, SI + LEAQ -4(R8), R8 + MOVW $0x0015, (DI) + MOVB R8, 2(DI) + ADDQ $0x03, DI JMP match_nolit_emitcopy_end_encodeBlockAsm14BAvx repeat_two_match_nolit_encodeBlockAsm14BAvx_emit_copy: - SHLL $0x02, DI - ORL $0x01, DI - MOVW DI, (SI) - ADDQ $0x02, SI + SHLL $0x02, R8 + ORL $0x01, R8 + MOVW R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsm14BAvx repeat_two_offset_match_nolit_encodeBlockAsm14BAvx_emit_copy: - XORQ R8, R8 - LEAQ 1(R8)(DI*4), DI - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, DI - MOVB DI, (SI) - ADDQ $0x02, SI + XORQ R9, R9 + LEAQ 1(R9)(R8*4), R8 + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R8 + MOVB R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsm14BAvx four_bytes_remain_match_nolit_encodeBlockAsm14BAvx: - TESTL DI, DI + TESTL R8, R8 JZ match_nolit_emitcopy_end_encodeBlockAsm14BAvx MOVB $0x03, DL - LEAQ -4(DX)(DI*4), DI - MOVB DI, (SI) - MOVD BP, 1(SI) - ADDQ $0x05, SI + LEAQ -4(DX)(R8*4), R8 + MOVB R8, (DI) + MOVD SI, 1(DI) + ADDQ $0x05, DI JMP match_nolit_emitcopy_end_encodeBlockAsm14BAvx two_byte_offset_match_nolit_encodeBlockAsm14BAvx: - CMPL DI, $0x40 + CMPL R8, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm14BAvx - MOVB $0xee, (SI) - MOVW BP, 1(SI) - LEAQ -60(DI), DI - ADDQ $0x03, SI + MOVB $0xee, (DI) + MOVW SI, 1(DI) + LEAQ -60(R8), R8 + ADDQ $0x03, DI emit_repeat_again_match_nolit_encodeBlockAsm14BAvx_emit_copy_short: - MOVQ DI, R8 - LEAQ -4(DI), DI - CMPL R8, $0x08 + MOVQ R8, R9 + LEAQ -4(R8), R8 + CMPL R9, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm14BAvx_emit_copy_short - CMPL R8, $0x0c + CMPL R9, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm14BAvx_emit_copy_short - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm14BAvx_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBlockAsm14BAvx_emit_copy_short: - CMPL DI, $0x00000104 + CMPL R8, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm14BAvx_emit_copy_short - CMPL DI, $0x00010100 + CMPL R8, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm14BAvx_emit_copy_short - CMPL DI, $0x0100ffff + CMPL R8, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsm14BAvx_emit_copy_short - LEAQ -16842747(DI), DI - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + LEAQ -16842747(R8), R8 + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_match_nolit_encodeBlockAsm14BAvx_emit_copy_short repeat_five_match_nolit_encodeBlockAsm14BAvx_emit_copy_short: - LEAQ -65536(DI), DI - MOVQ DI, BP - MOVW $0x001d, (SI) - MOVW DI, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + LEAQ -65536(R8), R8 + MOVQ R8, SI + MOVW $0x001d, (DI) + MOVW R8, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP match_nolit_emitcopy_end_encodeBlockAsm14BAvx repeat_four_match_nolit_encodeBlockAsm14BAvx_emit_copy_short: - LEAQ -256(DI), DI - MOVW $0x0019, (SI) - MOVW DI, 2(SI) - ADDQ $0x04, SI + LEAQ -256(R8), R8 + MOVW $0x0019, (DI) + MOVW R8, 2(DI) + ADDQ $0x04, DI JMP match_nolit_emitcopy_end_encodeBlockAsm14BAvx repeat_three_match_nolit_encodeBlockAsm14BAvx_emit_copy_short: - LEAQ -4(DI), DI - MOVW $0x0015, (SI) - MOVB DI, 2(SI) - ADDQ $0x03, SI + LEAQ -4(R8), R8 + MOVW $0x0015, (DI) + MOVB R8, 2(DI) + ADDQ $0x03, DI JMP match_nolit_emitcopy_end_encodeBlockAsm14BAvx repeat_two_match_nolit_encodeBlockAsm14BAvx_emit_copy_short: - SHLL $0x02, DI - ORL $0x01, DI - MOVW DI, (SI) - ADDQ $0x02, SI + SHLL $0x02, R8 + ORL $0x01, R8 + MOVW R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsm14BAvx repeat_two_offset_match_nolit_encodeBlockAsm14BAvx_emit_copy_short: - XORQ R8, R8 - LEAQ 1(R8)(DI*4), DI - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, DI - MOVB DI, (SI) - ADDQ $0x02, SI + XORQ R9, R9 + LEAQ 1(R9)(R8*4), R8 + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R8 + MOVB R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsm14BAvx two_byte_offset_short_match_nolit_encodeBlockAsm14BAvx: - CMPL DI, $0x0c + CMPL R8, $0x0c JGE emit_copy_three_match_nolit_encodeBlockAsm14BAvx - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_match_nolit_encodeBlockAsm14BAvx MOVB $0x01, DL - LEAQ -16(DX)(DI*4), DI - MOVB BP, 1(SI) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, DI - MOVB DI, (SI) - ADDQ $0x02, SI + LEAQ -16(DX)(R8*4), R8 + MOVB SI, 1(DI) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R8 + MOVB R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsm14BAvx emit_copy_three_match_nolit_encodeBlockAsm14BAvx: MOVB $0x02, DL - LEAQ -4(DX)(DI*4), DI - MOVB DI, (SI) - MOVW BP, 1(SI) - ADDQ $0x03, SI + LEAQ -4(DX)(R8*4), R8 + MOVB R8, (DI) + MOVW SI, 1(DI) + ADDQ $0x03, DI match_nolit_emitcopy_end_encodeBlockAsm14BAvx: - MOVQ SI, dst_base+0(FP) + MOVQ DI, dst_base+0(FP) MOVL AX, 20(SP) CMPL AX, 16(SP) JGE emit_remainder_encodeBlockAsm14BAvx - CMPQ SI, (SP) + CMPQ DI, (SP) JL match_nolit_dst_ok_encodeBlockAsm14BAvx MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeBlockAsm14BAvx: - MOVQ -2(CX)(AX*1), BP - MOVQ $0x0000cf1bbcdcbf9b, SI - MOVQ BP, DI - SHRQ $0x10, BP - MOVQ BP, R8 - SHLQ $0x10, DI - IMULQ SI, DI - SHRQ $0x32, DI + MOVQ -2(CX)(AX*1), SI + MOVQ $0x0000cf1bbcdcbf9b, DI + MOVQ SI, R8 + SHRQ $0x10, SI + MOVQ SI, R9 SHLQ $0x10, R8 - IMULQ SI, R8 + IMULQ DI, R8 SHRQ $0x32, R8 - MOVL 32(SP)(DI*1), SI - MOVL 32(SP)(R8*1), SI - LEAQ -2(AX), SI - MOVL SI, 32(SP)(DI*1) - MOVL AX, 32(SP)(R8*1) - CMPL (CX)(R8*1), BP + SHLQ $0x10, R9 + IMULQ DI, R9 + SHRQ $0x32, R9 + MOVL 32(SP)(R8*1), DI + MOVL 32(SP)(R9*1), DI + LEAQ -2(AX), DI + MOVL DI, 32(SP)(R8*1) + MOVL AX, 32(SP)(R9*1) + CMPL (CX)(R9*1), SI JEQ match_nolit_loop_encodeBlockAsm14BAvx INCL AX JMP search_loop_encodeBlockAsm14BAvx @@ -7149,11 +7149,11 @@ emit_remainder_ok_encodeBlockAsm14BAvx: JMP memmove_emit_remainder_encodeBlockAsm14BAvx four_bytes_emit_remainder_encodeBlockAsm14BAvx: - MOVQ DX, BP - SHRL $0x10, BP + MOVQ DX, SI + SHRL $0x10, SI MOVB $0xf8, (CX) MOVW DX, 1(CX) - MOVB BP, 3(CX) + MOVB SI, 3(CX) ADDQ $0x04, CX JMP memmove_emit_remainder_encodeBlockAsm14BAvx @@ -7203,9 +7203,9 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_tail: emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_1or2: MOVB (AX), DL - MOVB -1(AX)(BX*1), BP + MOVB -1(AX)(BX*1), SI MOVB DL, (CX) - MOVB BP, -1(CX)(BX*1) + MOVB SI, -1(CX)(BX*1) JMP emit_literal_done_emit_remainder_encodeBlockAsm14BAvx emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_4: @@ -7215,16 +7215,16 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_4: emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_3: MOVW (AX), DX - MOVB 2(AX), BP + MOVB 2(AX), SI MOVW DX, (CX) - MOVB BP, 2(CX) + MOVB SI, 2(CX) JMP emit_literal_done_emit_remainder_encodeBlockAsm14BAvx emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_5through7: MOVL (AX), DX - MOVL -4(AX)(BX*1), BP + MOVL -4(AX)(BX*1), SI MOVL DX, (CX) - MOVL BP, -4(CX)(BX*1) + MOVL SI, -4(CX)(BX*1) JMP emit_literal_done_emit_remainder_encodeBlockAsm14BAvx emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_8: @@ -7234,9 +7234,9 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_8: emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_9through16: MOVQ (AX), DX - MOVQ -8(AX)(BX*1), BP + MOVQ -8(AX)(BX*1), SI MOVQ DX, (CX) - MOVQ BP, -8(CX)(BX*1) + MOVQ SI, -8(CX)(BX*1) JMP emit_literal_done_emit_remainder_encodeBlockAsm14BAvx emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_17through32: @@ -7352,24 +7352,24 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_move_256through2048 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_tail emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_avxUnaligned: - LEAQ (AX)(BX*1), BP - MOVQ CX, DI - MOVOU -128(BP), X5 - MOVOU -112(BP), X6 + LEAQ (AX)(BX*1), SI + MOVQ CX, R8 + MOVOU -128(SI), X5 + MOVOU -112(SI), X6 MOVQ $0x00000080, DX ANDQ $0xffffffe0, CX ADDQ $0x20, CX - MOVOU -96(BP), X7 - MOVOU -80(BP), X8 - MOVQ CX, SI - SUBQ DI, SI - MOVOU -64(BP), X9 - MOVOU -48(BP), X10 - SUBQ SI, BX - MOVOU -32(BP), X11 - MOVOU -16(BP), X12 + MOVOU -96(SI), X7 + MOVOU -80(SI), X8 + MOVQ CX, DI + SUBQ R8, DI + MOVOU -64(SI), X9 + MOVOU -48(SI), X10 + SUBQ DI, BX + MOVOU -32(SI), X11 + MOVOU -16(SI), X12 VMOVDQU (AX), Y4 - ADDQ SI, AX + ADDQ DI, AX SUBQ DX, BX emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_gobble_128_loop: @@ -7387,7 +7387,7 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_gobble_128_loop: JA emit_lit_memmove_emit_remainder_encodeBlockAsm14BAvx_memmove_gobble_128_loop ADDQ DX, BX ADDQ CX, BX - VMOVDQU Y4, (DI) + VMOVDQU Y4, (R8) VZEROUPPER MOVOU X5, -128(BX) MOVOU X6, -112(BX) @@ -7444,224 +7444,224 @@ zero_loop_encodeBlockAsm12BAvx: MOVQ src_base+24(FP), CX search_loop_encodeBlockAsm12BAvx: - MOVQ (CX)(AX*1), BP + MOVQ (CX)(AX*1), SI MOVL AX, BX SUBL 20(SP), BX SHRL $0x04, BX LEAQ 4(AX)(BX*1), BX - MOVL 16(SP), SI - CMPL BX, SI + MOVL 16(SP), DI + CMPL BX, DI JGT emit_remainder_encodeBlockAsm12BAvx MOVL BX, 28(SP) MOVQ $0x0000cf1bbcdcbf9b, BX - MOVQ BP, DI - MOVQ BP, R8 - SHRQ $0x08, R8 - SHLQ $0x10, DI - IMULQ BX, DI - SHRQ $0x34, DI + MOVQ SI, R8 + MOVQ SI, R9 + SHRQ $0x08, R9 SHLQ $0x10, R8 IMULQ BX, R8 SHRQ $0x34, R8 - MOVL 32(SP)(DI*1), BX - MOVL 32(SP)(R8*1), SI - MOVL AX, 32(SP)(DI*1) - LEAL 1(AX), DI - MOVL DI, 32(SP)(R8*1) - MOVL AX, DI - SUBL 24(SP), DI - MOVL 1(CX)(DI*1), R9 - MOVQ BP, R8 - SHLQ $0x08, R8 - CMPL R8, R9 + SHLQ $0x10, R9 + IMULQ BX, R9 + SHRQ $0x34, R9 + MOVL 32(SP)(R8*1), BX + MOVL 32(SP)(R9*1), DI + MOVL AX, 32(SP)(R8*1) + LEAL 1(AX), R8 + MOVL R8, 32(SP)(R9*1) + MOVL AX, R8 + SUBL 24(SP), R8 + MOVL 1(CX)(R8*1), R10 + MOVQ SI, R9 + SHLQ $0x08, R9 + CMPL R9, R10 JNE no_repeat_found_encodeBlockAsm12BAvx - LEAQ 1(AX), BP + LEAQ 1(AX), SI MOVL 20(SP), BX - TESTL DI, DI + TESTL R8, R8 JZ repeat_extend_back_end_encodeBlockAsm12BAvx repeat_extend_back_loop_encodeBlockAsm12BAvx: - CMPL BP, BX + CMPL SI, BX JG repeat_extend_back_end_encodeBlockAsm12BAvx - MOVB -1(CX)(DI*1), DL - MOVB -1(CX)(BP*1), SI - CMPB DL, SI + MOVB -1(CX)(R8*1), DL + MOVB -1(CX)(SI*1), DI + CMPB DL, DI JNE repeat_extend_back_end_encodeBlockAsm12BAvx - LEAQ -1(BP), BP - DECL DI + LEAQ -1(SI), SI + DECL R8 JZ repeat_extend_back_end_encodeBlockAsm12BAvx JMP repeat_extend_back_loop_encodeBlockAsm12BAvx repeat_extend_back_end_encodeBlockAsm12BAvx: MOVL 20(SP), BX - CMPL BX, BP + CMPL BX, SI JEQ emit_literal_skip_repeat_emit_encodeBlockAsm12BAvx - MOVL BP, SI - MOVL BP, 20(SP) - LEAQ (CX)(BX*1), DI - SUBL BX, SI + MOVL SI, DI + MOVL SI, 20(SP) + LEAQ (CX)(BX*1), R8 + SUBL BX, DI MOVQ dst_base+0(FP), BX - MOVQ SI, R8 - SUBL $0x01, R8 + MOVQ DI, R9 + SUBL $0x01, R9 JC emit_literal_done_repeat_emit_encodeBlockAsm12BAvx - CMPL R8, $0x3c + CMPL R9, $0x3c JLT one_byte_repeat_emit_encodeBlockAsm12BAvx - CMPL R8, $0x00000100 + CMPL R9, $0x00000100 JLT two_bytes_repeat_emit_encodeBlockAsm12BAvx - CMPL R8, $0x00010000 + CMPL R9, $0x00010000 JLT three_bytes_repeat_emit_encodeBlockAsm12BAvx - CMPL R8, $0x01000000 + CMPL R9, $0x01000000 JLT four_bytes_repeat_emit_encodeBlockAsm12BAvx MOVB $0xfc, (BX) - MOVL R8, 1(BX) + MOVL R9, 1(BX) ADDQ $0x05, BX JMP memmove_repeat_emit_encodeBlockAsm12BAvx four_bytes_repeat_emit_encodeBlockAsm12BAvx: - MOVQ R8, R9 - SHRL $0x10, R9 + MOVQ R9, R10 + SHRL $0x10, R10 MOVB $0xf8, (BX) - MOVW R8, 1(BX) - MOVB R9, 3(BX) + MOVW R9, 1(BX) + MOVB R10, 3(BX) ADDQ $0x04, BX JMP memmove_repeat_emit_encodeBlockAsm12BAvx three_bytes_repeat_emit_encodeBlockAsm12BAvx: MOVB $0xf4, (BX) - MOVW R8, 1(BX) + MOVW R9, 1(BX) ADDQ $0x03, BX JMP memmove_repeat_emit_encodeBlockAsm12BAvx two_bytes_repeat_emit_encodeBlockAsm12BAvx: MOVB $0xf0, (BX) - MOVB R8, 1(BX) + MOVB R9, 1(BX) ADDQ $0x02, BX JMP memmove_repeat_emit_encodeBlockAsm12BAvx one_byte_repeat_emit_encodeBlockAsm12BAvx: - SHLB $0x02, R8 - MOVB R8, (BX) + SHLB $0x02, R9 + MOVB R9, (BX) ADDQ $0x01, BX memmove_repeat_emit_encodeBlockAsm12BAvx: - LEAQ (BX)(SI*1), R8 + LEAQ (BX)(DI*1), R9 NOP emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_tail: - TESTQ SI, SI + TESTQ DI, DI JEQ emit_literal_done_repeat_emit_encodeBlockAsm12BAvx - CMPQ SI, $0x02 + CMPQ DI, $0x02 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_1or2 - CMPQ SI, $0x04 + CMPQ DI, $0x04 JB emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_3 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_4 - CMPQ SI, $0x08 + CMPQ DI, $0x08 JB emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_5through7 JE emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_8 - CMPQ SI, $0x10 + CMPQ DI, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_9through16 - CMPQ SI, $0x20 + CMPQ DI, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_17through32 - CMPQ SI, $0x40 + CMPQ DI, $0x40 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_33through64 - CMPQ SI, $0x80 + CMPQ DI, $0x80 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_65through128 - CMPQ SI, $0x00000100 + CMPQ DI, $0x00000100 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_129through256 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_avxUnaligned emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_1or2: - MOVB (DI), R8 - MOVB -1(DI)(SI*1), R9 - MOVB R8, (BX) - MOVB R9, -1(BX)(SI*1) + MOVB (R8), R9 + MOVB -1(R8)(DI*1), R10 + MOVB R9, (BX) + MOVB R10, -1(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_4: - MOVL (DI), R8 - MOVL R8, (BX) + MOVL (R8), R9 + MOVL R9, (BX) JMP emit_literal_done_repeat_emit_encodeBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_3: - MOVW (DI), R8 - MOVB 2(DI), R9 - MOVW R8, (BX) - MOVB R9, 2(BX) + MOVW (R8), R9 + MOVB 2(R8), R10 + MOVW R9, (BX) + MOVB R10, 2(BX) JMP emit_literal_done_repeat_emit_encodeBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_5through7: - MOVL (DI), R8 - MOVL -4(DI)(SI*1), R9 - MOVL R8, (BX) - MOVL R9, -4(BX)(SI*1) + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R10 + MOVL R9, (BX) + MOVL R10, -4(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_8: - MOVQ (DI), R8 - MOVQ R8, (BX) + MOVQ (R8), R9 + MOVQ R9, (BX) JMP emit_literal_done_repeat_emit_encodeBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_9through16: - MOVQ (DI), R8 - MOVQ -8(DI)(SI*1), R9 - MOVQ R8, (BX) - MOVQ R9, -8(BX)(SI*1) + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R10 + MOVQ R9, (BX) + MOVQ R10, -8(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(SI*1), X1 + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 MOVOU X0, (BX) - MOVOU X1, -16(BX)(SI*1) + MOVOU X1, -16(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(SI*1), X2 - MOVOU -16(DI)(SI*1), X3 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 MOVOU X0, (BX) MOVOU X1, 16(BX) - MOVOU X2, -32(BX)(SI*1) - MOVOU X3, -16(BX)(SI*1) + MOVOU X2, -32(BX)(DI*1) + MOVOU X3, -16(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_65through128: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU 32(DI), X2 - MOVOU 48(DI), X3 - MOVOU -64(DI)(SI*1), X12 - MOVOU -48(DI)(SI*1), X13 - MOVOU -32(DI)(SI*1), X14 - MOVOU -16(DI)(SI*1), X15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (BX) MOVOU X1, 16(BX) MOVOU X2, 32(BX) MOVOU X3, 48(BX) - MOVOU X12, -64(BX)(SI*1) - MOVOU X13, -48(BX)(SI*1) - MOVOU X14, -32(BX)(SI*1) - MOVOU X15, -16(BX)(SI*1) + MOVOU X12, -64(BX)(DI*1) + MOVOU X13, -48(BX)(DI*1) + MOVOU X14, -32(BX)(DI*1) + MOVOU X15, -16(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_129through256: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU 32(DI), X2 - MOVOU 48(DI), X3 - MOVOU 64(DI), X4 - MOVOU 80(DI), X5 - MOVOU 96(DI), X6 - MOVOU 112(DI), X7 - MOVOU -128(DI)(SI*1), X8 - MOVOU -112(DI)(SI*1), X9 - MOVOU -96(DI)(SI*1), X10 - MOVOU -80(DI)(SI*1), X11 - MOVOU -64(DI)(SI*1), X12 - MOVOU -48(DI)(SI*1), X13 - MOVOU -32(DI)(SI*1), X14 - MOVOU -16(DI)(SI*1), X15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU -128(R8)(DI*1), X8 + MOVOU -112(R8)(DI*1), X9 + MOVOU -96(R8)(DI*1), X10 + MOVOU -80(R8)(DI*1), X11 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (BX) MOVOU X1, 16(BX) MOVOU X2, 32(BX) @@ -7670,34 +7670,34 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_129through256: MOVOU X5, 80(BX) MOVOU X6, 96(BX) MOVOU X7, 112(BX) - MOVOU X8, -128(BX)(SI*1) - MOVOU X9, -112(BX)(SI*1) - MOVOU X10, -96(BX)(SI*1) - MOVOU X11, -80(BX)(SI*1) - MOVOU X12, -64(BX)(SI*1) - MOVOU X13, -48(BX)(SI*1) - MOVOU X14, -32(BX)(SI*1) - MOVOU X15, -16(BX)(SI*1) + MOVOU X8, -128(BX)(DI*1) + MOVOU X9, -112(BX)(DI*1) + MOVOU X10, -96(BX)(DI*1) + MOVOU X11, -80(BX)(DI*1) + MOVOU X12, -64(BX)(DI*1) + MOVOU X13, -48(BX)(DI*1) + MOVOU X14, -32(BX)(DI*1) + MOVOU X15, -16(BX)(DI*1) JMP emit_literal_done_repeat_emit_encodeBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_256through2048: - LEAQ -256(SI), SI - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU 32(DI), X2 - MOVOU 48(DI), X3 - MOVOU 64(DI), X4 - MOVOU 80(DI), X5 - MOVOU 96(DI), X6 - MOVOU 112(DI), X7 - MOVOU 128(DI), X8 - MOVOU 144(DI), X9 - MOVOU 160(DI), X10 - MOVOU 176(DI), X11 - MOVOU 192(DI), X12 - MOVOU 208(DI), X13 - MOVOU 224(DI), X14 - MOVOU 240(DI), X15 + LEAQ -256(DI), DI + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU 128(R8), X8 + MOVOU 144(R8), X9 + MOVOU 160(R8), X10 + MOVOU 176(R8), X11 + MOVOU 192(R8), X12 + MOVOU 208(R8), X13 + MOVOU 224(R8), X14 + MOVOU 240(R8), X15 MOVOU X0, (BX) MOVOU X1, 16(BX) MOVOU X2, 32(BX) @@ -7714,60 +7714,60 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_256through2048: MOVOU X13, 208(BX) MOVOU X14, 224(BX) MOVOU X15, 240(BX) - CMPQ SI, $0x00000100 - LEAQ 256(DI), DI + CMPQ DI, $0x00000100 + LEAQ 256(R8), R8 LEAQ 256(BX), BX JGE emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_256through2048 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_tail emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_avxUnaligned: - LEAQ (DI)(SI*1), R9 - MOVQ BX, R11 - MOVOU -128(R9), X5 - MOVOU -112(R9), X6 - MOVQ $0x00000080, R8 + LEAQ (R8)(DI*1), R10 + MOVQ BX, R12 + MOVOU -128(R10), X5 + MOVOU -112(R10), X6 + MOVQ $0x00000080, R9 ANDQ $0xffffffe0, BX ADDQ $0x20, BX - MOVOU -96(R9), X7 - MOVOU -80(R9), X8 - MOVQ BX, R10 - SUBQ R11, R10 - MOVOU -64(R9), X9 - MOVOU -48(R9), X10 - SUBQ R10, SI - MOVOU -32(R9), X11 - MOVOU -16(R9), X12 - VMOVDQU (DI), Y4 - ADDQ R10, DI - SUBQ R8, SI + MOVOU -96(R10), X7 + MOVOU -80(R10), X8 + MOVQ BX, R11 + SUBQ R12, R11 + MOVOU -64(R10), X9 + MOVOU -48(R10), X10 + SUBQ R11, DI + MOVOU -32(R10), X11 + MOVOU -16(R10), X12 + VMOVDQU (R8), Y4 + ADDQ R11, R8 + SUBQ R9, DI emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_gobble_128_loop: - VMOVDQU (DI), Y0 - VMOVDQU 32(DI), Y1 - VMOVDQU 64(DI), Y2 - VMOVDQU 96(DI), Y3 - ADDQ R8, DI + VMOVDQU (R8), Y0 + VMOVDQU 32(R8), Y1 + VMOVDQU 64(R8), Y2 + VMOVDQU 96(R8), Y3 + ADDQ R9, R8 VMOVDQA Y0, (BX) VMOVDQA Y1, 32(BX) VMOVDQA Y2, 64(BX) VMOVDQA Y3, 96(BX) - ADDQ R8, BX - SUBQ R8, SI + ADDQ R9, BX + SUBQ R9, DI JA emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_gobble_128_loop - ADDQ R8, SI - ADDQ BX, SI - VMOVDQU Y4, (R11) + ADDQ R9, DI + ADDQ BX, DI + VMOVDQU Y4, (R12) VZEROUPPER - MOVOU X5, -128(SI) - MOVOU X6, -112(SI) - MOVOU X7, -96(SI) - MOVOU X8, -80(SI) - MOVOU X9, -64(SI) - MOVOU X10, -48(SI) - MOVOU X11, -32(SI) - MOVOU X12, -16(SI) + MOVOU X5, -128(DI) + MOVOU X6, -112(DI) + MOVOU X7, -96(DI) + MOVOU X8, -80(DI) + MOVOU X9, -64(DI) + MOVOU X10, -48(DI) + MOVOU X11, -32(DI) + MOVOU X12, -16(DI) JMP emit_literal_done_repeat_emit_encodeBlockAsm12BAvx - MOVQ R8, BX + MOVQ R9, BX emit_literal_done_repeat_emit_encodeBlockAsm12BAvx: MOVQ BX, dst_base+0(FP) @@ -7778,23 +7778,23 @@ emit_literal_skip_repeat_emit_encodeBlockAsm12BAvx: SUBL 24(SP), BX MOVL 16(SP), BX SUBL AX, BX - XORQ DI, DI + XORQ R8, R8 CMPQ BX, $0x08 JL matchlen_single_repeat_extend matchlen_loopback_repeat_extend: - MOVQ (CX)(DI*1), SI - XORQ (CX)(DI*1), SI - TESTQ SI, SI + MOVQ (CX)(R8*1), DI + XORQ (CX)(R8*1), DI + TESTQ DI, DI JZ matchlen_loop_repeat_extend - BSFQ SI, SI - SARQ $0x03, SI - LEAQ (DI)(SI*1), DI + BSFQ DI, DI + SARQ $0x03, DI + LEAQ (R8)(DI*1), R8 JMP repeat_extend_forward_end_encodeBlockAsm12BAvx matchlen_loop_repeat_extend: LEAQ -8(BX), BX - LEAQ 8(DI), DI + LEAQ 8(R8), R8 CMPQ BX, $0x08 JGE matchlen_loopback_repeat_extend @@ -7803,31 +7803,31 @@ matchlen_single_repeat_extend: JZ repeat_extend_forward_end_encodeBlockAsm12BAvx matchlen_single_loopback_repeat_extend: - MOVB (CX)(DI*1), SI - CMPB (CX)(DI*1), SI + MOVB (CX)(R8*1), DI + CMPB (CX)(R8*1), DI JNE repeat_extend_forward_end_encodeBlockAsm12BAvx - LEAQ 1(DI), DI + LEAQ 1(R8), R8 DECQ BX JNZ matchlen_single_loopback_repeat_extend repeat_extend_forward_end_encodeBlockAsm12BAvx: - ADDL DI, AX + ADDL R8, AX MOVL AX, BX - SUBL BP, BX - MOVL 24(SP), BP - MOVQ dst_base+0(FP), SI - MOVL 20(SP), DI - TESTL DI, DI + SUBL SI, BX + MOVL 24(SP), SI + MOVQ dst_base+0(FP), DI + MOVL 20(SP), R8 + TESTL R8, R8 JZ repeat_as_copy_encodeBlockAsm12BAvx emit_repeat_again_match_repeat_: - MOVQ BX, DI + MOVQ BX, R8 LEAQ -4(BX), BX - CMPL DI, $0x08 + CMPL R8, $0x08 JLE repeat_two_match_repeat_ - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_match_repeat_ - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_repeat_ cant_repeat_two_offset_match_repeat_: @@ -7838,74 +7838,74 @@ cant_repeat_two_offset_match_repeat_: CMPL BX, $0x0100ffff JLT repeat_five_match_repeat_ LEAQ -16842747(BX), BX - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_match_repeat_ repeat_five_match_repeat_: LEAQ -65536(BX), BX - MOVQ BX, BP - MOVW $0x001d, (SI) - MOVW BX, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + MOVQ BX, SI + MOVW $0x001d, (DI) + MOVW BX, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP repeat_end_emit_encodeBlockAsm12BAvx repeat_four_match_repeat_: LEAQ -256(BX), BX - MOVW $0x0019, (SI) - MOVW BX, 2(SI) - ADDQ $0x04, SI + MOVW $0x0019, (DI) + MOVW BX, 2(DI) + ADDQ $0x04, DI JMP repeat_end_emit_encodeBlockAsm12BAvx repeat_three_match_repeat_: LEAQ -4(BX), BX - MOVW $0x0015, (SI) - MOVB BL, 2(SI) - ADDQ $0x03, SI + MOVW $0x0015, (DI) + MOVB BL, 2(DI) + ADDQ $0x03, DI JMP repeat_end_emit_encodeBlockAsm12BAvx repeat_two_match_repeat_: SHLL $0x02, BX ORL $0x01, BX - MOVW BX, (SI) - ADDQ $0x02, SI + MOVW BX, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm12BAvx repeat_two_offset_match_repeat_: - XORQ DI, DI - LEAQ 1(DI)(BX*4), BX - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (SI) - ADDQ $0x02, SI + XORQ R8, R8 + LEAQ 1(R8)(BX*4), BX + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm12BAvx repeat_as_copy_encodeBlockAsm12BAvx: - CMPL BP, $0x00010000 + CMPL SI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeBlockAsm12BAvx CMPL BX, $0x40 JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm12BAvx - MOVB $0xff, (SI) - MOVD BP, 1(SI) + MOVB $0xff, (DI) + MOVD SI, 1(DI) LEAQ -64(BX), BX - ADDQ $0x05, SI + ADDQ $0x05, DI CMPL BX, $0x04 JL four_bytes_remain_repeat_as_copy_encodeBlockAsm12BAvx emit_repeat_again_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy: - MOVQ BX, DI + MOVQ BX, R8 LEAQ -4(BX), BX - CMPL DI, $0x08 + CMPL R8, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy: @@ -7916,52 +7916,52 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy: CMPL BX, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy LEAQ -16842747(BX), BX - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy repeat_five_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy: LEAQ -65536(BX), BX - MOVQ BX, BP - MOVW $0x001d, (SI) - MOVW BX, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + MOVQ BX, SI + MOVW $0x001d, (DI) + MOVW BX, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP repeat_end_emit_encodeBlockAsm12BAvx repeat_four_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy: LEAQ -256(BX), BX - MOVW $0x0019, (SI) - MOVW BX, 2(SI) - ADDQ $0x04, SI + MOVW $0x0019, (DI) + MOVW BX, 2(DI) + ADDQ $0x04, DI JMP repeat_end_emit_encodeBlockAsm12BAvx repeat_three_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy: LEAQ -4(BX), BX - MOVW $0x0015, (SI) - MOVB BL, 2(SI) - ADDQ $0x03, SI + MOVW $0x0015, (DI) + MOVB BL, 2(DI) + ADDQ $0x03, DI JMP repeat_end_emit_encodeBlockAsm12BAvx repeat_two_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy: SHLL $0x02, BX ORL $0x01, BX - MOVW BX, (SI) - ADDQ $0x02, SI + MOVW BX, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm12BAvx repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy: - XORQ DI, DI - LEAQ 1(DI)(BX*4), BX - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (SI) - ADDQ $0x02, SI + XORQ R8, R8 + LEAQ 1(R8)(BX*4), BX + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm12BAvx four_bytes_remain_repeat_as_copy_encodeBlockAsm12BAvx: @@ -7969,27 +7969,27 @@ four_bytes_remain_repeat_as_copy_encodeBlockAsm12BAvx: JZ repeat_end_emit_encodeBlockAsm12BAvx MOVB $0x03, DL LEAQ -4(DX)(BX*4), BX - MOVB BL, (SI) - MOVD BP, 1(SI) - ADDQ $0x05, SI + MOVB BL, (DI) + MOVD SI, 1(DI) + ADDQ $0x05, DI JMP repeat_end_emit_encodeBlockAsm12BAvx two_byte_offset_repeat_as_copy_encodeBlockAsm12BAvx: CMPL BX, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12BAvx - MOVB $0xee, (SI) - MOVW BP, 1(SI) + MOVB $0xee, (DI) + MOVW SI, 1(DI) LEAQ -60(BX), BX - ADDQ $0x03, SI + ADDQ $0x03, DI emit_repeat_again_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short: - MOVQ BX, DI + MOVQ BX, R8 LEAQ -4(BX), BX - CMPL DI, $0x08 + CMPL R8, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short: @@ -8000,100 +8000,100 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short: CMPL BX, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short LEAQ -16842747(BX), BX - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short repeat_five_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short: LEAQ -65536(BX), BX - MOVQ BX, BP - MOVW $0x001d, (SI) - MOVW BX, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + MOVQ BX, SI + MOVW $0x001d, (DI) + MOVW BX, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP repeat_end_emit_encodeBlockAsm12BAvx repeat_four_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short: LEAQ -256(BX), BX - MOVW $0x0019, (SI) - MOVW BX, 2(SI) - ADDQ $0x04, SI + MOVW $0x0019, (DI) + MOVW BX, 2(DI) + ADDQ $0x04, DI JMP repeat_end_emit_encodeBlockAsm12BAvx repeat_three_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short: LEAQ -4(BX), BX - MOVW $0x0015, (SI) - MOVB BL, 2(SI) - ADDQ $0x03, SI + MOVW $0x0015, (DI) + MOVB BL, 2(DI) + ADDQ $0x03, DI JMP repeat_end_emit_encodeBlockAsm12BAvx repeat_two_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short: SHLL $0x02, BX ORL $0x01, BX - MOVW BX, (SI) - ADDQ $0x02, SI + MOVW BX, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm12BAvx repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short: - XORQ DI, DI - LEAQ 1(DI)(BX*4), BX - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (SI) - ADDQ $0x02, SI + XORQ R8, R8 + LEAQ 1(R8)(BX*4), BX + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm12BAvx two_byte_offset_short_repeat_as_copy_encodeBlockAsm12BAvx: CMPL BX, $0x0c JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12BAvx - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12BAvx MOVB $0x01, DL LEAQ -16(DX)(BX*4), BX - MOVB BP, 1(SI) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (SI) - ADDQ $0x02, SI + MOVB SI, 1(DI) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BX + MOVB BL, (DI) + ADDQ $0x02, DI JMP repeat_end_emit_encodeBlockAsm12BAvx emit_copy_three_repeat_as_copy_encodeBlockAsm12BAvx: MOVB $0x02, DL LEAQ -4(DX)(BX*4), BX - MOVB BL, (SI) - MOVW BP, 1(SI) - ADDQ $0x03, SI + MOVB BL, (DI) + MOVW SI, 1(DI) + ADDQ $0x03, DI repeat_end_emit_encodeBlockAsm12BAvx: - MOVQ SI, dst_base+0(FP) + MOVQ DI, dst_base+0(FP) MOVL 16(SP), BX CMPL AX, BX JGT emit_remainder_encodeBlockAsm12BAvx JMP search_loop_encodeBlockAsm12BAvx no_repeat_found_encodeBlockAsm12BAvx: - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ BP, DI - SHRQ $0x10, DI - SHLQ $0x10, DI - IMULQ R8, DI - SHRQ $0x34, DI - CMPL (CX)(BX*1), BP - SHRQ $0x08, BP + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ SI, R8 + SHRQ $0x10, R8 + SHLQ $0x10, R8 + IMULQ R9, R8 + SHRQ $0x34, R8 + CMPL (CX)(BX*1), SI + SHRQ $0x08, SI JEQ candidate_match_encodeBlockAsm12BAvx - MOVL 32(SP)(DI*1), BX - CMPL (CX)(SI*1), BP + MOVL 32(SP)(R8*1), BX + CMPL (CX)(DI*1), SI JEQ candidate2_match_encodeBlockAsm12BAvx - LEAQ 2(AX), SI - MOVL SI, 32(SP)(DI*1) - SHRQ $0x08, BP - CMPL (CX)(BX*1), BP + LEAQ 2(AX), DI + MOVL DI, 32(SP)(R8*1) + SHRQ $0x08, SI + CMPL (CX)(BX*1), SI JEQ candidate3_match_encodeBlockAsm12BAvx MOVL 28(SP), AX JMP search_loop_encodeBlockAsm12BAvx @@ -8104,21 +8104,21 @@ candidate3_match_encodeBlockAsm12BAvx: candidate2_match_encodeBlockAsm12BAvx: LEAQ -2(AX), BX - MOVL BX, 32(SP)(DI*1) + MOVL BX, 32(SP)(R8*1) INCL AX - MOVL SI, BX + MOVL DI, BX candidate_match_encodeBlockAsm12BAvx: - MOVL 20(SP), BP + MOVL 20(SP), SI TESTL BX, BX JZ match_extend_back_end_encodeBlockAsm12BAvx match_extend_back_loop_encodeBlockAsm12BAvx: - CMPL AX, BP + CMPL AX, SI JG match_extend_back_end_encodeBlockAsm12BAvx MOVB -1(CX)(BX*1), DL - MOVB -1(CX)(AX*1), SI - CMPB DL, SI + MOVB -1(CX)(AX*1), DI + CMPB DL, DI JNE match_extend_back_end_encodeBlockAsm12BAvx LEAL -1(AX), AX DECL BX @@ -8126,555 +8126,555 @@ match_extend_back_loop_encodeBlockAsm12BAvx: JMP match_extend_back_loop_encodeBlockAsm12BAvx match_extend_back_end_encodeBlockAsm12BAvx: - MOVL AX, BP - SUBL 20(SP), BP - LEAQ dst_base+0(FP)(BP*1), BP - CMPQ BP, (SP) + MOVL AX, SI + SUBL 20(SP), SI + LEAQ dst_base+0(FP)(SI*1), SI + CMPQ SI, (SP) JL match_dst_size_check_encodeBlockAsm12BAvx MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBlockAsm12BAvx: - MOVL BX, BP - MOVL 20(SP), SI - CMPL SI, BP + MOVL BX, SI + MOVL 20(SP), DI + CMPL DI, SI JEQ emit_literal_skip_match_emit_encodeBlockAsm12BAvx - MOVL BP, DI - MOVL BP, 20(SP) - LEAQ (CX)(SI*1), BP - SUBL SI, DI - MOVQ dst_base+0(FP), SI - MOVQ DI, R8 - SUBL $0x01, R8 + MOVL SI, R8 + MOVL SI, 20(SP) + LEAQ (CX)(DI*1), SI + SUBL DI, R8 + MOVQ dst_base+0(FP), DI + MOVQ R8, R9 + SUBL $0x01, R9 JC emit_literal_done_match_emit_encodeBlockAsm12BAvx - CMPL R8, $0x3c + CMPL R9, $0x3c JLT one_byte_match_emit_encodeBlockAsm12BAvx - CMPL R8, $0x00000100 + CMPL R9, $0x00000100 JLT two_bytes_match_emit_encodeBlockAsm12BAvx - CMPL R8, $0x00010000 + CMPL R9, $0x00010000 JLT three_bytes_match_emit_encodeBlockAsm12BAvx - CMPL R8, $0x01000000 + CMPL R9, $0x01000000 JLT four_bytes_match_emit_encodeBlockAsm12BAvx - MOVB $0xfc, (SI) - MOVL R8, 1(SI) - ADDQ $0x05, SI + MOVB $0xfc, (DI) + MOVL R9, 1(DI) + ADDQ $0x05, DI JMP memmove_match_emit_encodeBlockAsm12BAvx four_bytes_match_emit_encodeBlockAsm12BAvx: - MOVQ R8, R9 - SHRL $0x10, R9 - MOVB $0xf8, (SI) - MOVW R8, 1(SI) - MOVB R9, 3(SI) - ADDQ $0x04, SI + MOVQ R9, R10 + SHRL $0x10, R10 + MOVB $0xf8, (DI) + MOVW R9, 1(DI) + MOVB R10, 3(DI) + ADDQ $0x04, DI JMP memmove_match_emit_encodeBlockAsm12BAvx three_bytes_match_emit_encodeBlockAsm12BAvx: - MOVB $0xf4, (SI) - MOVW R8, 1(SI) - ADDQ $0x03, SI + MOVB $0xf4, (DI) + MOVW R9, 1(DI) + ADDQ $0x03, DI JMP memmove_match_emit_encodeBlockAsm12BAvx two_bytes_match_emit_encodeBlockAsm12BAvx: - MOVB $0xf0, (SI) - MOVB R8, 1(SI) - ADDQ $0x02, SI + MOVB $0xf0, (DI) + MOVB R9, 1(DI) + ADDQ $0x02, DI JMP memmove_match_emit_encodeBlockAsm12BAvx one_byte_match_emit_encodeBlockAsm12BAvx: - SHLB $0x02, R8 - MOVB R8, (SI) - ADDQ $0x01, SI + SHLB $0x02, R9 + MOVB R9, (DI) + ADDQ $0x01, DI memmove_match_emit_encodeBlockAsm12BAvx: - LEAQ (SI)(DI*1), R8 + LEAQ (DI)(R8*1), R9 NOP emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_tail: - TESTQ DI, DI + TESTQ R8, R8 JEQ emit_literal_done_match_emit_encodeBlockAsm12BAvx - CMPQ DI, $0x02 + CMPQ R8, $0x02 JBE emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_1or2 - CMPQ DI, $0x04 + CMPQ R8, $0x04 JB emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_3 JBE emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_4 - CMPQ DI, $0x08 + CMPQ R8, $0x08 JB emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_5through7 JE emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_8 - CMPQ DI, $0x10 + CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_9through16 - CMPQ DI, $0x20 + CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_17through32 - CMPQ DI, $0x40 + CMPQ R8, $0x40 JBE emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_33through64 - CMPQ DI, $0x80 + CMPQ R8, $0x80 JBE emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_65through128 - CMPQ DI, $0x00000100 + CMPQ R8, $0x00000100 JBE emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_129through256 JMP emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_avxUnaligned emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_1or2: - MOVB (BP), R8 - MOVB -1(BP)(DI*1), R9 - MOVB R8, (SI) - MOVB R9, -1(SI)(DI*1) + MOVB (SI), R9 + MOVB -1(SI)(R8*1), R10 + MOVB R9, (DI) + MOVB R10, -1(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm12BAvx emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_4: - MOVL (BP), R8 - MOVL R8, (SI) + MOVL (SI), R9 + MOVL R9, (DI) JMP emit_literal_done_match_emit_encodeBlockAsm12BAvx emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_3: - MOVW (BP), R8 - MOVB 2(BP), R9 - MOVW R8, (SI) - MOVB R9, 2(SI) + MOVW (SI), R9 + MOVB 2(SI), R10 + MOVW R9, (DI) + MOVB R10, 2(DI) JMP emit_literal_done_match_emit_encodeBlockAsm12BAvx emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_5through7: - MOVL (BP), R8 - MOVL -4(BP)(DI*1), R9 - MOVL R8, (SI) - MOVL R9, -4(SI)(DI*1) + MOVL (SI), R9 + MOVL -4(SI)(R8*1), R10 + MOVL R9, (DI) + MOVL R10, -4(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm12BAvx emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_8: - MOVQ (BP), R8 - MOVQ R8, (SI) + MOVQ (SI), R9 + MOVQ R9, (DI) JMP emit_literal_done_match_emit_encodeBlockAsm12BAvx emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_9through16: - MOVQ (BP), R8 - MOVQ -8(BP)(DI*1), R9 - MOVQ R8, (SI) - MOVQ R9, -8(SI)(DI*1) + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), R10 + MOVQ R9, (DI) + MOVQ R10, -8(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm12BAvx emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_17through32: - MOVOU (BP), X0 - MOVOU -16(BP)(DI*1), X1 - MOVOU X0, (SI) - MOVOU X1, -16(SI)(DI*1) + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 + MOVOU X0, (DI) + MOVOU X1, -16(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm12BAvx emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_33through64: - MOVOU (BP), X0 - MOVOU 16(BP), X1 - MOVOU -32(BP)(DI*1), X2 - MOVOU -16(BP)(DI*1), X3 - MOVOU X0, (SI) - MOVOU X1, 16(SI) - MOVOU X2, -32(SI)(DI*1) - MOVOU X3, -16(SI)(DI*1) + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, -32(DI)(R8*1) + MOVOU X3, -16(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm12BAvx emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_65through128: - MOVOU (BP), X0 - MOVOU 16(BP), X1 - MOVOU 32(BP), X2 - MOVOU 48(BP), X3 - MOVOU -64(BP)(DI*1), X12 - MOVOU -48(BP)(DI*1), X13 - MOVOU -32(BP)(DI*1), X14 - MOVOU -16(BP)(DI*1), X15 - MOVOU X0, (SI) - MOVOU X1, 16(SI) - MOVOU X2, 32(SI) - MOVOU X3, 48(SI) - MOVOU X12, -64(SI)(DI*1) - MOVOU X13, -48(SI)(DI*1) - MOVOU X14, -32(SI)(DI*1) - MOVOU X15, -16(SI)(DI*1) + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) + MOVOU X3, 48(DI) + MOVOU X12, -64(DI)(R8*1) + MOVOU X13, -48(DI)(R8*1) + MOVOU X14, -32(DI)(R8*1) + MOVOU X15, -16(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm12BAvx emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_129through256: - MOVOU (BP), X0 - MOVOU 16(BP), X1 - MOVOU 32(BP), X2 - MOVOU 48(BP), X3 - MOVOU 64(BP), X4 - MOVOU 80(BP), X5 - MOVOU 96(BP), X6 - MOVOU 112(BP), X7 - MOVOU -128(BP)(DI*1), X8 - MOVOU -112(BP)(DI*1), X9 - MOVOU -96(BP)(DI*1), X10 - MOVOU -80(BP)(DI*1), X11 - MOVOU -64(BP)(DI*1), X12 - MOVOU -48(BP)(DI*1), X13 - MOVOU -32(BP)(DI*1), X14 - MOVOU -16(BP)(DI*1), X15 - MOVOU X0, (SI) - MOVOU X1, 16(SI) - MOVOU X2, 32(SI) - MOVOU X3, 48(SI) - MOVOU X4, 64(SI) - MOVOU X5, 80(SI) - MOVOU X6, 96(SI) - MOVOU X7, 112(SI) - MOVOU X8, -128(SI)(DI*1) - MOVOU X9, -112(SI)(DI*1) - MOVOU X10, -96(SI)(DI*1) - MOVOU X11, -80(SI)(DI*1) - MOVOU X12, -64(SI)(DI*1) - MOVOU X13, -48(SI)(DI*1) - MOVOU X14, -32(SI)(DI*1) - MOVOU X15, -16(SI)(DI*1) + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU -128(SI)(R8*1), X8 + MOVOU -112(SI)(R8*1), X9 + MOVOU -96(SI)(R8*1), X10 + MOVOU -80(SI)(R8*1), X11 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) + MOVOU X3, 48(DI) + MOVOU X4, 64(DI) + MOVOU X5, 80(DI) + MOVOU X6, 96(DI) + MOVOU X7, 112(DI) + MOVOU X8, -128(DI)(R8*1) + MOVOU X9, -112(DI)(R8*1) + MOVOU X10, -96(DI)(R8*1) + MOVOU X11, -80(DI)(R8*1) + MOVOU X12, -64(DI)(R8*1) + MOVOU X13, -48(DI)(R8*1) + MOVOU X14, -32(DI)(R8*1) + MOVOU X15, -16(DI)(R8*1) JMP emit_literal_done_match_emit_encodeBlockAsm12BAvx emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_256through2048: - LEAQ -256(DI), DI - MOVOU (BP), X0 - MOVOU 16(BP), X1 - MOVOU 32(BP), X2 - MOVOU 48(BP), X3 - MOVOU 64(BP), X4 - MOVOU 80(BP), X5 - MOVOU 96(BP), X6 - MOVOU 112(BP), X7 - MOVOU 128(BP), X8 - MOVOU 144(BP), X9 - MOVOU 160(BP), X10 - MOVOU 176(BP), X11 - MOVOU 192(BP), X12 - MOVOU 208(BP), X13 - MOVOU 224(BP), X14 - MOVOU 240(BP), X15 - MOVOU X0, (SI) - MOVOU X1, 16(SI) - MOVOU X2, 32(SI) - MOVOU X3, 48(SI) - MOVOU X4, 64(SI) - MOVOU X5, 80(SI) - MOVOU X6, 96(SI) - MOVOU X7, 112(SI) - MOVOU X8, 128(SI) - MOVOU X9, 144(SI) - MOVOU X10, 160(SI) - MOVOU X11, 176(SI) - MOVOU X12, 192(SI) - MOVOU X13, 208(SI) - MOVOU X14, 224(SI) - MOVOU X15, 240(SI) - CMPQ DI, $0x00000100 - LEAQ 256(BP), BP + LEAQ -256(R8), R8 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU 128(SI), X8 + MOVOU 144(SI), X9 + MOVOU 160(SI), X10 + MOVOU 176(SI), X11 + MOVOU 192(SI), X12 + MOVOU 208(SI), X13 + MOVOU 224(SI), X14 + MOVOU 240(SI), X15 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) + MOVOU X3, 48(DI) + MOVOU X4, 64(DI) + MOVOU X5, 80(DI) + MOVOU X6, 96(DI) + MOVOU X7, 112(DI) + MOVOU X8, 128(DI) + MOVOU X9, 144(DI) + MOVOU X10, 160(DI) + MOVOU X11, 176(DI) + MOVOU X12, 192(DI) + MOVOU X13, 208(DI) + MOVOU X14, 224(DI) + MOVOU X15, 240(DI) + CMPQ R8, $0x00000100 LEAQ 256(SI), SI + LEAQ 256(DI), DI JGE emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_256through2048 JMP emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_tail emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_avxUnaligned: - LEAQ (BP)(DI*1), R9 - MOVQ SI, R11 - MOVOU -128(R9), X5 - MOVOU -112(R9), X6 - MOVQ $0x00000080, R8 - ANDQ $0xffffffe0, SI - ADDQ $0x20, SI - MOVOU -96(R9), X7 - MOVOU -80(R9), X8 - MOVQ SI, R10 - SUBQ R11, R10 - MOVOU -64(R9), X9 - MOVOU -48(R9), X10 - SUBQ R10, DI - MOVOU -32(R9), X11 - MOVOU -16(R9), X12 - VMOVDQU (BP), Y4 - ADDQ R10, BP - SUBQ R8, DI + LEAQ (SI)(R8*1), R10 + MOVQ DI, R12 + MOVOU -128(R10), X5 + MOVOU -112(R10), X6 + MOVQ $0x00000080, R9 + ANDQ $0xffffffe0, DI + ADDQ $0x20, DI + MOVOU -96(R10), X7 + MOVOU -80(R10), X8 + MOVQ DI, R11 + SUBQ R12, R11 + MOVOU -64(R10), X9 + MOVOU -48(R10), X10 + SUBQ R11, R8 + MOVOU -32(R10), X11 + MOVOU -16(R10), X12 + VMOVDQU (SI), Y4 + ADDQ R11, SI + SUBQ R9, R8 emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_gobble_128_loop: - VMOVDQU (BP), Y0 - VMOVDQU 32(BP), Y1 - VMOVDQU 64(BP), Y2 - VMOVDQU 96(BP), Y3 - ADDQ R8, BP - VMOVDQA Y0, (SI) - VMOVDQA Y1, 32(SI) - VMOVDQA Y2, 64(SI) - VMOVDQA Y3, 96(SI) - ADDQ R8, SI - SUBQ R8, DI + VMOVDQU (SI), Y0 + VMOVDQU 32(SI), Y1 + VMOVDQU 64(SI), Y2 + VMOVDQU 96(SI), Y3 + ADDQ R9, SI + VMOVDQA Y0, (DI) + VMOVDQA Y1, 32(DI) + VMOVDQA Y2, 64(DI) + VMOVDQA Y3, 96(DI) + ADDQ R9, DI + SUBQ R9, R8 JA emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_gobble_128_loop - ADDQ R8, DI - ADDQ SI, DI - VMOVDQU Y4, (R11) + ADDQ R9, R8 + ADDQ DI, R8 + VMOVDQU Y4, (R12) VZEROUPPER - MOVOU X5, -128(DI) - MOVOU X6, -112(DI) - MOVOU X7, -96(DI) - MOVOU X8, -80(DI) - MOVOU X9, -64(DI) - MOVOU X10, -48(DI) - MOVOU X11, -32(DI) - MOVOU X12, -16(DI) + MOVOU X5, -128(R8) + MOVOU X6, -112(R8) + MOVOU X7, -96(R8) + MOVOU X8, -80(R8) + MOVOU X9, -64(R8) + MOVOU X10, -48(R8) + MOVOU X11, -32(R8) + MOVOU X12, -16(R8) JMP emit_literal_done_match_emit_encodeBlockAsm12BAvx - MOVQ R8, SI + MOVQ R9, DI emit_literal_done_match_emit_encodeBlockAsm12BAvx: - MOVQ SI, dst_base+0(FP) + MOVQ DI, dst_base+0(FP) emit_literal_skip_match_emit_encodeBlockAsm12BAvx: NOP match_nolit_loop_encodeBlockAsm12BAvx: - MOVL AX, BP - MOVL AX, BP - SUBL BX, BP - MOVL BP, 24(SP) + MOVL AX, SI + MOVL AX, SI + SUBL BX, SI + MOVL SI, 24(SP) ADDL $0x04, AX ADDL $0x04, BX - MOVL 16(SP), BP - SUBL AX, BP - XORQ DI, DI - CMPQ BP, $0x08 + MOVL 16(SP), SI + SUBL AX, SI + XORQ R8, R8 + CMPQ SI, $0x08 JL matchlen_single_match_nolit_encodeBlockAsm12BAvx matchlen_loopback_match_nolit_encodeBlockAsm12BAvx: - MOVQ (CX)(DI*1), SI - XORQ (CX)(DI*1), SI - TESTQ SI, SI + MOVQ (CX)(R8*1), DI + XORQ (CX)(R8*1), DI + TESTQ DI, DI JZ matchlen_loop_match_nolit_encodeBlockAsm12BAvx - BSFQ SI, SI - SARQ $0x03, SI - LEAQ (DI)(SI*1), DI + BSFQ DI, DI + SARQ $0x03, DI + LEAQ (R8)(DI*1), R8 JMP match_nolit_end_encodeBlockAsm12BAvx matchlen_loop_match_nolit_encodeBlockAsm12BAvx: - LEAQ -8(BP), BP - LEAQ 8(DI), DI - CMPQ BP, $0x08 + LEAQ -8(SI), SI + LEAQ 8(R8), R8 + CMPQ SI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm12BAvx matchlen_single_match_nolit_encodeBlockAsm12BAvx: - TESTQ BP, BP + TESTQ SI, SI JZ match_nolit_end_encodeBlockAsm12BAvx matchlen_single_loopback_match_nolit_encodeBlockAsm12BAvx: - MOVB (CX)(DI*1), SI - CMPB (CX)(DI*1), SI + MOVB (CX)(R8*1), DI + CMPB (CX)(R8*1), DI JNE match_nolit_end_encodeBlockAsm12BAvx - LEAQ 1(DI), DI - DECQ BP + LEAQ 1(R8), R8 + DECQ SI JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm12BAvx match_nolit_end_encodeBlockAsm12BAvx: - MOVL 24(SP), BP - ADDQ $0x04, DI - MOVQ dst_base+0(FP), SI - ADDL DI, AX - CMPL BP, $0x00010000 + MOVL 24(SP), SI + ADDQ $0x04, R8 + MOVQ dst_base+0(FP), DI + ADDL R8, AX + CMPL SI, $0x00010000 JL two_byte_offset_match_nolit_encodeBlockAsm12BAvx - CMPL DI, $0x40 + CMPL R8, $0x40 JLE four_bytes_remain_match_nolit_encodeBlockAsm12BAvx - MOVB $0xff, (SI) - MOVD BP, 1(SI) - LEAQ -64(DI), DI - ADDQ $0x05, SI - CMPL DI, $0x04 + MOVB $0xff, (DI) + MOVD SI, 1(DI) + LEAQ -64(R8), R8 + ADDQ $0x05, DI + CMPL R8, $0x04 JL four_bytes_remain_match_nolit_encodeBlockAsm12BAvx emit_repeat_again_match_nolit_encodeBlockAsm12BAvx_emit_copy: - MOVQ DI, R8 - LEAQ -4(DI), DI - CMPL R8, $0x08 + MOVQ R8, R9 + LEAQ -4(R8), R8 + CMPL R9, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm12BAvx_emit_copy - CMPL R8, $0x0c + CMPL R9, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12BAvx_emit_copy - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm12BAvx_emit_copy cant_repeat_two_offset_match_nolit_encodeBlockAsm12BAvx_emit_copy: - CMPL DI, $0x00000104 + CMPL R8, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm12BAvx_emit_copy - CMPL DI, $0x00010100 + CMPL R8, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm12BAvx_emit_copy - CMPL DI, $0x0100ffff + CMPL R8, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsm12BAvx_emit_copy - LEAQ -16842747(DI), DI - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + LEAQ -16842747(R8), R8 + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_match_nolit_encodeBlockAsm12BAvx_emit_copy repeat_five_match_nolit_encodeBlockAsm12BAvx_emit_copy: - LEAQ -65536(DI), DI - MOVQ DI, BP - MOVW $0x001d, (SI) - MOVW DI, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + LEAQ -65536(R8), R8 + MOVQ R8, SI + MOVW $0x001d, (DI) + MOVW R8, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx repeat_four_match_nolit_encodeBlockAsm12BAvx_emit_copy: - LEAQ -256(DI), DI - MOVW $0x0019, (SI) - MOVW DI, 2(SI) - ADDQ $0x04, SI + LEAQ -256(R8), R8 + MOVW $0x0019, (DI) + MOVW R8, 2(DI) + ADDQ $0x04, DI JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx repeat_three_match_nolit_encodeBlockAsm12BAvx_emit_copy: - LEAQ -4(DI), DI - MOVW $0x0015, (SI) - MOVB DI, 2(SI) - ADDQ $0x03, SI + LEAQ -4(R8), R8 + MOVW $0x0015, (DI) + MOVB R8, 2(DI) + ADDQ $0x03, DI JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx repeat_two_match_nolit_encodeBlockAsm12BAvx_emit_copy: - SHLL $0x02, DI - ORL $0x01, DI - MOVW DI, (SI) - ADDQ $0x02, SI + SHLL $0x02, R8 + ORL $0x01, R8 + MOVW R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx repeat_two_offset_match_nolit_encodeBlockAsm12BAvx_emit_copy: - XORQ R8, R8 - LEAQ 1(R8)(DI*4), DI - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, DI - MOVB DI, (SI) - ADDQ $0x02, SI + XORQ R9, R9 + LEAQ 1(R9)(R8*4), R8 + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R8 + MOVB R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx four_bytes_remain_match_nolit_encodeBlockAsm12BAvx: - TESTL DI, DI + TESTL R8, R8 JZ match_nolit_emitcopy_end_encodeBlockAsm12BAvx MOVB $0x03, DL - LEAQ -4(DX)(DI*4), DI - MOVB DI, (SI) - MOVD BP, 1(SI) - ADDQ $0x05, SI + LEAQ -4(DX)(R8*4), R8 + MOVB R8, (DI) + MOVD SI, 1(DI) + ADDQ $0x05, DI JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx two_byte_offset_match_nolit_encodeBlockAsm12BAvx: - CMPL DI, $0x40 + CMPL R8, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm12BAvx - MOVB $0xee, (SI) - MOVW BP, 1(SI) - LEAQ -60(DI), DI - ADDQ $0x03, SI + MOVB $0xee, (DI) + MOVW SI, 1(DI) + LEAQ -60(R8), R8 + ADDQ $0x03, DI emit_repeat_again_match_nolit_encodeBlockAsm12BAvx_emit_copy_short: - MOVQ DI, R8 - LEAQ -4(DI), DI - CMPL R8, $0x08 + MOVQ R8, R9 + LEAQ -4(R8), R8 + CMPL R9, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm12BAvx_emit_copy_short - CMPL R8, $0x0c + CMPL R9, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12BAvx_emit_copy_short - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm12BAvx_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBlockAsm12BAvx_emit_copy_short: - CMPL DI, $0x00000104 + CMPL R8, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm12BAvx_emit_copy_short - CMPL DI, $0x00010100 + CMPL R8, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm12BAvx_emit_copy_short - CMPL DI, $0x0100ffff + CMPL R8, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsm12BAvx_emit_copy_short - LEAQ -16842747(DI), DI - MOVW $0x001d, (SI) - MOVW $0xfffb, 2(SI) - MOVB $0xff, 4(SI) - ADDQ $0x05, SI + LEAQ -16842747(R8), R8 + MOVW $0x001d, (DI) + MOVW $0xfffb, 2(DI) + MOVB $0xff, 4(DI) + ADDQ $0x05, DI JMP emit_repeat_again_match_nolit_encodeBlockAsm12BAvx_emit_copy_short repeat_five_match_nolit_encodeBlockAsm12BAvx_emit_copy_short: - LEAQ -65536(DI), DI - MOVQ DI, BP - MOVW $0x001d, (SI) - MOVW DI, 2(SI) - SARQ $0x10, BP - MOVB BP, 4(SI) - ADDQ $0x05, SI + LEAQ -65536(R8), R8 + MOVQ R8, SI + MOVW $0x001d, (DI) + MOVW R8, 2(DI) + SARQ $0x10, SI + MOVB SI, 4(DI) + ADDQ $0x05, DI JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx repeat_four_match_nolit_encodeBlockAsm12BAvx_emit_copy_short: - LEAQ -256(DI), DI - MOVW $0x0019, (SI) - MOVW DI, 2(SI) - ADDQ $0x04, SI + LEAQ -256(R8), R8 + MOVW $0x0019, (DI) + MOVW R8, 2(DI) + ADDQ $0x04, DI JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx repeat_three_match_nolit_encodeBlockAsm12BAvx_emit_copy_short: - LEAQ -4(DI), DI - MOVW $0x0015, (SI) - MOVB DI, 2(SI) - ADDQ $0x03, SI + LEAQ -4(R8), R8 + MOVW $0x0015, (DI) + MOVB R8, 2(DI) + ADDQ $0x03, DI JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx repeat_two_match_nolit_encodeBlockAsm12BAvx_emit_copy_short: - SHLL $0x02, DI - ORL $0x01, DI - MOVW DI, (SI) - ADDQ $0x02, SI + SHLL $0x02, R8 + ORL $0x01, R8 + MOVW R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx repeat_two_offset_match_nolit_encodeBlockAsm12BAvx_emit_copy_short: - XORQ R8, R8 - LEAQ 1(R8)(DI*4), DI - MOVB BP, 1(SI) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, DI - MOVB DI, (SI) - ADDQ $0x02, SI + XORQ R9, R9 + LEAQ 1(R9)(R8*4), R8 + MOVB SI, 1(DI) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R8 + MOVB R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx two_byte_offset_short_match_nolit_encodeBlockAsm12BAvx: - CMPL DI, $0x0c + CMPL R8, $0x0c JGE emit_copy_three_match_nolit_encodeBlockAsm12BAvx - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_match_nolit_encodeBlockAsm12BAvx MOVB $0x01, DL - LEAQ -16(DX)(DI*4), DI - MOVB BP, 1(SI) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, DI - MOVB DI, (SI) - ADDQ $0x02, SI + LEAQ -16(DX)(R8*4), R8 + MOVB SI, 1(DI) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R8 + MOVB R8, (DI) + ADDQ $0x02, DI JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx emit_copy_three_match_nolit_encodeBlockAsm12BAvx: MOVB $0x02, DL - LEAQ -4(DX)(DI*4), DI - MOVB DI, (SI) - MOVW BP, 1(SI) - ADDQ $0x03, SI + LEAQ -4(DX)(R8*4), R8 + MOVB R8, (DI) + MOVW SI, 1(DI) + ADDQ $0x03, DI match_nolit_emitcopy_end_encodeBlockAsm12BAvx: - MOVQ SI, dst_base+0(FP) + MOVQ DI, dst_base+0(FP) MOVL AX, 20(SP) CMPL AX, 16(SP) JGE emit_remainder_encodeBlockAsm12BAvx - CMPQ SI, (SP) + CMPQ DI, (SP) JL match_nolit_dst_ok_encodeBlockAsm12BAvx MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeBlockAsm12BAvx: - MOVQ -2(CX)(AX*1), BP - MOVQ $0x0000cf1bbcdcbf9b, SI - MOVQ BP, DI - SHRQ $0x10, BP - MOVQ BP, R8 - SHLQ $0x10, DI - IMULQ SI, DI - SHRQ $0x34, DI + MOVQ -2(CX)(AX*1), SI + MOVQ $0x0000cf1bbcdcbf9b, DI + MOVQ SI, R8 + SHRQ $0x10, SI + MOVQ SI, R9 SHLQ $0x10, R8 - IMULQ SI, R8 + IMULQ DI, R8 SHRQ $0x34, R8 - MOVL 32(SP)(DI*1), SI - MOVL 32(SP)(R8*1), SI - LEAQ -2(AX), SI - MOVL SI, 32(SP)(DI*1) - MOVL AX, 32(SP)(R8*1) - CMPL (CX)(R8*1), BP + SHLQ $0x10, R9 + IMULQ DI, R9 + SHRQ $0x34, R9 + MOVL 32(SP)(R8*1), DI + MOVL 32(SP)(R9*1), DI + LEAQ -2(AX), DI + MOVL DI, 32(SP)(R8*1) + MOVL AX, 32(SP)(R9*1) + CMPL (CX)(R9*1), SI JEQ match_nolit_loop_encodeBlockAsm12BAvx INCL AX JMP search_loop_encodeBlockAsm12BAvx @@ -8716,11 +8716,11 @@ emit_remainder_ok_encodeBlockAsm12BAvx: JMP memmove_emit_remainder_encodeBlockAsm12BAvx four_bytes_emit_remainder_encodeBlockAsm12BAvx: - MOVQ DX, BP - SHRL $0x10, BP + MOVQ DX, SI + SHRL $0x10, SI MOVB $0xf8, (CX) MOVW DX, 1(CX) - MOVB BP, 3(CX) + MOVB SI, 3(CX) ADDQ $0x04, CX JMP memmove_emit_remainder_encodeBlockAsm12BAvx @@ -8770,9 +8770,9 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_tail: emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_1or2: MOVB (AX), DL - MOVB -1(AX)(BX*1), BP + MOVB -1(AX)(BX*1), SI MOVB DL, (CX) - MOVB BP, -1(CX)(BX*1) + MOVB SI, -1(CX)(BX*1) JMP emit_literal_done_emit_remainder_encodeBlockAsm12BAvx emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_4: @@ -8782,16 +8782,16 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_4: emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_3: MOVW (AX), DX - MOVB 2(AX), BP + MOVB 2(AX), SI MOVW DX, (CX) - MOVB BP, 2(CX) + MOVB SI, 2(CX) JMP emit_literal_done_emit_remainder_encodeBlockAsm12BAvx emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_5through7: MOVL (AX), DX - MOVL -4(AX)(BX*1), BP + MOVL -4(AX)(BX*1), SI MOVL DX, (CX) - MOVL BP, -4(CX)(BX*1) + MOVL SI, -4(CX)(BX*1) JMP emit_literal_done_emit_remainder_encodeBlockAsm12BAvx emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_8: @@ -8801,9 +8801,9 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_8: emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_9through16: MOVQ (AX), DX - MOVQ -8(AX)(BX*1), BP + MOVQ -8(AX)(BX*1), SI MOVQ DX, (CX) - MOVQ BP, -8(CX)(BX*1) + MOVQ SI, -8(CX)(BX*1) JMP emit_literal_done_emit_remainder_encodeBlockAsm12BAvx emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_17through32: @@ -8919,24 +8919,24 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_move_256through2048 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_tail emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_avxUnaligned: - LEAQ (AX)(BX*1), BP - MOVQ CX, DI - MOVOU -128(BP), X5 - MOVOU -112(BP), X6 + LEAQ (AX)(BX*1), SI + MOVQ CX, R8 + MOVOU -128(SI), X5 + MOVOU -112(SI), X6 MOVQ $0x00000080, DX ANDQ $0xffffffe0, CX ADDQ $0x20, CX - MOVOU -96(BP), X7 - MOVOU -80(BP), X8 - MOVQ CX, SI - SUBQ DI, SI - MOVOU -64(BP), X9 - MOVOU -48(BP), X10 - SUBQ SI, BX - MOVOU -32(BP), X11 - MOVOU -16(BP), X12 + MOVOU -96(SI), X7 + MOVOU -80(SI), X8 + MOVQ CX, DI + SUBQ R8, DI + MOVOU -64(SI), X9 + MOVOU -48(SI), X10 + SUBQ DI, BX + MOVOU -32(SI), X11 + MOVOU -16(SI), X12 VMOVDQU (AX), Y4 - ADDQ SI, AX + ADDQ DI, AX SUBQ DX, BX emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_gobble_128_loop: @@ -8954,7 +8954,7 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_gobble_128_loop: JA emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_gobble_128_loop ADDQ DX, BX ADDQ CX, BX - VMOVDQU Y4, (DI) + VMOVDQU Y4, (R8) VZEROUPPER MOVOU X5, -128(BX) MOVOU X6, -112(BX) @@ -8978,55 +8978,55 @@ emit_literal_skip_emit_remainder_encodeBlockAsm12BAvx: // func emitLiteral(dst []byte, lit []byte) int // Requires: SSE2 -TEXT ·emitLiteral(SB), NOSPLIT, $8-56 +TEXT ·emitLiteral(SB), NOSPLIT, $0-56 MOVQ dst_base+0(FP), AX MOVQ lit_base+24(FP), CX MOVQ lit_len+32(FP), DX MOVQ DX, BX - MOVQ DX, BP - SUBL $0x01, BP + MOVQ DX, SI + SUBL $0x01, SI JC emit_literal_end_standalone - CMPL BP, $0x3c + CMPL SI, $0x3c JLT one_byte_standalone - CMPL BP, $0x00000100 + CMPL SI, $0x00000100 JLT two_bytes_standalone - CMPL BP, $0x00010000 + CMPL SI, $0x00010000 JLT three_bytes_standalone - CMPL BP, $0x01000000 + CMPL SI, $0x01000000 JLT four_bytes_standalone MOVB $0xfc, (AX) - MOVL BP, 1(AX) + MOVL SI, 1(AX) ADDQ $0x05, BX ADDQ $0x05, AX JMP memmove_standalone four_bytes_standalone: - MOVQ BP, SI - SHRL $0x10, SI + MOVQ SI, DI + SHRL $0x10, DI MOVB $0xf8, (AX) - MOVW BP, 1(AX) - MOVB SI, 3(AX) + MOVW SI, 1(AX) + MOVB DI, 3(AX) ADDQ $0x04, BX ADDQ $0x04, AX JMP memmove_standalone three_bytes_standalone: MOVB $0xf4, (AX) - MOVW BP, 1(AX) + MOVW SI, 1(AX) ADDQ $0x03, BX ADDQ $0x03, AX JMP memmove_standalone two_bytes_standalone: MOVB $0xf0, (AX) - MOVB BP, 1(AX) + MOVB SI, 1(AX) ADDQ $0x02, BX ADDQ $0x02, AX JMP memmove_standalone one_byte_standalone: - SHLB $0x02, BP - MOVB BP, (AX) + SHLB $0x02, SI + MOVB SI, (AX) ADDQ $0x01, BX ADDQ $0x01, AX @@ -9057,40 +9057,40 @@ emit_lit_memmove_standalone_memmove_tail: JMP emit_lit_memmove_standalone_memmove_move_256through2048 emit_lit_memmove_standalone_memmove_move_1or2: - MOVB (CX), BP + MOVB (CX), SI MOVB -1(CX)(DX*1), CL - MOVB BP, (AX) + MOVB SI, (AX) MOVB CL, -1(AX)(DX*1) JMP emit_literal_end_standalone emit_lit_memmove_standalone_memmove_move_4: - MOVL (CX), BP - MOVL BP, (AX) + MOVL (CX), SI + MOVL SI, (AX) JMP emit_literal_end_standalone emit_lit_memmove_standalone_memmove_move_3: - MOVW (CX), BP + MOVW (CX), SI MOVB 2(CX), CL - MOVW BP, (AX) + MOVW SI, (AX) MOVB CL, 2(AX) JMP emit_literal_end_standalone emit_lit_memmove_standalone_memmove_move_5through7: - MOVL (CX), BP + MOVL (CX), SI MOVL -4(CX)(DX*1), CX - MOVL BP, (AX) + MOVL SI, (AX) MOVL CX, -4(AX)(DX*1) JMP emit_literal_end_standalone emit_lit_memmove_standalone_memmove_move_8: - MOVQ (CX), BP - MOVQ BP, (AX) + MOVQ (CX), SI + MOVQ SI, (AX) JMP emit_literal_end_standalone emit_lit_memmove_standalone_memmove_move_9through16: - MOVQ (CX), BP + MOVQ (CX), SI MOVQ -8(CX)(DX*1), CX - MOVQ BP, (AX) + MOVQ SI, (AX) MOVQ CX, -8(AX)(DX*1) JMP emit_literal_end_standalone @@ -9212,55 +9212,55 @@ emit_literal_end_standalone: // func emitLiteralAvx(dst []byte, lit []byte) int // Requires: AVX, SSE2 -TEXT ·emitLiteralAvx(SB), NOSPLIT, $8-56 +TEXT ·emitLiteralAvx(SB), NOSPLIT, $0-56 MOVQ dst_base+0(FP), AX MOVQ lit_base+24(FP), CX MOVQ lit_len+32(FP), DX MOVQ DX, BX - MOVQ DX, BP - SUBL $0x01, BP + MOVQ DX, SI + SUBL $0x01, SI JC emit_literal_end_avx_standalone - CMPL BP, $0x3c + CMPL SI, $0x3c JLT one_byte_standalone - CMPL BP, $0x00000100 + CMPL SI, $0x00000100 JLT two_bytes_standalone - CMPL BP, $0x00010000 + CMPL SI, $0x00010000 JLT three_bytes_standalone - CMPL BP, $0x01000000 + CMPL SI, $0x01000000 JLT four_bytes_standalone MOVB $0xfc, (AX) - MOVL BP, 1(AX) + MOVL SI, 1(AX) ADDQ $0x05, BX ADDQ $0x05, AX JMP memmove_standalone four_bytes_standalone: - MOVQ BP, SI - SHRL $0x10, SI + MOVQ SI, DI + SHRL $0x10, DI MOVB $0xf8, (AX) - MOVW BP, 1(AX) - MOVB SI, 3(AX) + MOVW SI, 1(AX) + MOVB DI, 3(AX) ADDQ $0x04, BX ADDQ $0x04, AX JMP memmove_standalone three_bytes_standalone: MOVB $0xf4, (AX) - MOVW BP, 1(AX) + MOVW SI, 1(AX) ADDQ $0x03, BX ADDQ $0x03, AX JMP memmove_standalone two_bytes_standalone: MOVB $0xf0, (AX) - MOVB BP, 1(AX) + MOVB SI, 1(AX) ADDQ $0x02, BX ADDQ $0x02, AX JMP memmove_standalone one_byte_standalone: - SHLB $0x02, BP - MOVB BP, (AX) + SHLB $0x02, SI + MOVB SI, (AX) ADDQ $0x01, BX ADDQ $0x01, AX @@ -9291,41 +9291,41 @@ emit_lit_memmove_standalone_memmove_tail: JMP emit_lit_memmove_standalone_memmove_avxUnaligned emit_lit_memmove_standalone_memmove_move_1or2: - MOVB (CX), BP - MOVB -1(CX)(DX*1), SI - MOVB BP, (AX) - MOVB SI, -1(AX)(DX*1) + MOVB (CX), SI + MOVB -1(CX)(DX*1), DI + MOVB SI, (AX) + MOVB DI, -1(AX)(DX*1) JMP emit_literal_end_avx_standalone emit_lit_memmove_standalone_memmove_move_4: - MOVL (CX), BP - MOVL BP, (AX) + MOVL (CX), SI + MOVL SI, (AX) JMP emit_literal_end_avx_standalone emit_lit_memmove_standalone_memmove_move_3: - MOVW (CX), BP - MOVB 2(CX), SI - MOVW BP, (AX) - MOVB SI, 2(AX) + MOVW (CX), SI + MOVB 2(CX), DI + MOVW SI, (AX) + MOVB DI, 2(AX) JMP emit_literal_end_avx_standalone emit_lit_memmove_standalone_memmove_move_5through7: - MOVL (CX), BP - MOVL -4(CX)(DX*1), SI - MOVL BP, (AX) - MOVL SI, -4(AX)(DX*1) + MOVL (CX), SI + MOVL -4(CX)(DX*1), DI + MOVL SI, (AX) + MOVL DI, -4(AX)(DX*1) JMP emit_literal_end_avx_standalone emit_lit_memmove_standalone_memmove_move_8: - MOVQ (CX), BP - MOVQ BP, (AX) + MOVQ (CX), SI + MOVQ SI, (AX) JMP emit_literal_end_avx_standalone emit_lit_memmove_standalone_memmove_move_9through16: - MOVQ (CX), BP - MOVQ -8(CX)(DX*1), SI - MOVQ BP, (AX) - MOVQ SI, -8(AX)(DX*1) + MOVQ (CX), SI + MOVQ -8(CX)(DX*1), DI + MOVQ SI, (AX) + MOVQ DI, -8(AX)(DX*1) JMP emit_literal_end_avx_standalone emit_lit_memmove_standalone_memmove_move_17through32: @@ -9441,42 +9441,42 @@ emit_lit_memmove_standalone_memmove_move_256through2048: JMP emit_lit_memmove_standalone_memmove_tail emit_lit_memmove_standalone_memmove_avxUnaligned: - LEAQ (CX)(DX*1), SI - MOVQ AX, R8 - MOVOU -128(SI), X5 - MOVOU -112(SI), X6 - MOVQ $0x00000080, BP + LEAQ (CX)(DX*1), DI + MOVQ AX, R9 + MOVOU -128(DI), X5 + MOVOU -112(DI), X6 + MOVQ $0x00000080, SI ANDQ $0xffffffe0, AX ADDQ $0x20, AX - MOVOU -96(SI), X7 - MOVOU -80(SI), X8 - MOVQ AX, DI - SUBQ R8, DI - MOVOU -64(SI), X9 - MOVOU -48(SI), X10 - SUBQ DI, DX - MOVOU -32(SI), X11 - MOVOU -16(SI), X12 + MOVOU -96(DI), X7 + MOVOU -80(DI), X8 + MOVQ AX, R8 + SUBQ R9, R8 + MOVOU -64(DI), X9 + MOVOU -48(DI), X10 + SUBQ R8, DX + MOVOU -32(DI), X11 + MOVOU -16(DI), X12 VMOVDQU (CX), Y4 - ADDQ DI, CX - SUBQ BP, DX + ADDQ R8, CX + SUBQ SI, DX emit_lit_memmove_standalone_memmove_gobble_128_loop: VMOVDQU (CX), Y0 VMOVDQU 32(CX), Y1 VMOVDQU 64(CX), Y2 VMOVDQU 96(CX), Y3 - ADDQ BP, CX + ADDQ SI, CX VMOVDQA Y0, (AX) VMOVDQA Y1, 32(AX) VMOVDQA Y2, 64(AX) VMOVDQA Y3, 96(AX) - ADDQ BP, AX - SUBQ BP, DX + ADDQ SI, AX + SUBQ SI, DX JA emit_lit_memmove_standalone_memmove_gobble_128_loop - ADDQ BP, DX + ADDQ SI, DX ADDQ AX, DX - VMOVDQU Y4, (R8) + VMOVDQU Y4, (R9) VZEROUPPER MOVOU X5, -128(DX) MOVOU X6, -112(DX) @@ -9492,18 +9492,18 @@ emit_literal_end_avx_standalone: RET // func emitRepeat(dst []byte, offset int, length int) int -TEXT ·emitRepeat(SB), NOSPLIT, $8-48 +TEXT ·emitRepeat(SB), NOSPLIT, $0-48 XORQ BX, BX MOVQ dst_base+0(FP), AX MOVQ offset+24(FP), CX MOVQ length+32(FP), DX emit_repeat_again_standalone: - MOVQ DX, BP + MOVQ DX, SI LEAQ -4(DX), DX - CMPL BP, $0x08 + CMPL SI, $0x08 JLE repeat_two_standalone - CMPL BP, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_standalone CMPL CX, $0x00000800 JLT repeat_two_offset_standalone @@ -9559,8 +9559,8 @@ repeat_two_standalone: JMP gen_emit_repeat_end repeat_two_offset_standalone: - XORQ BP, BP - LEAQ 1(BP)(DX*4), DX + XORQ SI, SI + LEAQ 1(SI)(DX*4), DX MOVB CL, 1(AX) SARL $0x08, CX SHLL $0x05, CX @@ -9574,7 +9574,7 @@ gen_emit_repeat_end: RET // func emitCopy(dst []byte, offset int, length int) int -TEXT ·emitCopy(SB), NOSPLIT, $8-48 +TEXT ·emitCopy(SB), NOSPLIT, $0-48 XORQ BX, BX MOVQ dst_base+0(FP), AX MOVQ offset+24(FP), CX @@ -9592,11 +9592,11 @@ TEXT ·emitCopy(SB), NOSPLIT, $8-48 JL four_bytes_remain_standalone emit_repeat_again_standalone_emit_copy: - MOVQ DX, BP + MOVQ DX, SI LEAQ -4(DX), DX - CMPL BP, $0x08 + CMPL SI, $0x08 JLE repeat_two_standalone_emit_copy - CMPL BP, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_standalone_emit_copy CMPL CX, $0x00000800 JLT repeat_two_offset_standalone_emit_copy @@ -9652,8 +9652,8 @@ repeat_two_standalone_emit_copy: JMP gen_emit_copy_end repeat_two_offset_standalone_emit_copy: - XORQ BP, BP - LEAQ 1(BP)(DX*4), DX + XORQ SI, SI + LEAQ 1(SI)(DX*4), DX MOVB CL, 1(AX) SARL $0x08, CX SHLL $0x05, CX @@ -9666,8 +9666,8 @@ repeat_two_offset_standalone_emit_copy: four_bytes_remain_standalone: TESTL DX, DX JZ gen_emit_copy_end - MOVB $0x03, BP - LEAQ -4(BP)(DX*4), DX + MOVB $0x03, SI + LEAQ -4(SI)(DX*4), DX MOVB DL, (AX) MOVD CX, 1(AX) ADDQ $0x05, BX @@ -9684,11 +9684,11 @@ two_byte_offset_standalone: ADDQ $0x03, BX emit_repeat_again_standalone_emit_copy_short: - MOVQ DX, BP + MOVQ DX, SI LEAQ -4(DX), DX - CMPL BP, $0x08 + CMPL SI, $0x08 JLE repeat_two_standalone_emit_copy_short - CMPL BP, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_standalone_emit_copy_short CMPL CX, $0x00000800 JLT repeat_two_offset_standalone_emit_copy_short @@ -9744,8 +9744,8 @@ repeat_two_standalone_emit_copy_short: JMP gen_emit_copy_end repeat_two_offset_standalone_emit_copy_short: - XORQ BP, BP - LEAQ 1(BP)(DX*4), DX + XORQ SI, SI + LEAQ 1(SI)(DX*4), DX MOVB CL, 1(AX) SARL $0x08, CX SHLL $0x05, CX @@ -9760,8 +9760,8 @@ two_byte_offset_short_standalone: JGE emit_copy_three_standalone CMPL CX, $0x00000800 JGE emit_copy_three_standalone - MOVB $0x01, BP - LEAQ -16(BP)(DX*4), DX + MOVB $0x01, SI + LEAQ -16(SI)(DX*4), DX MOVB CL, 1(AX) SHRL $0x08, CX SHLL $0x05, CX @@ -9772,8 +9772,8 @@ two_byte_offset_short_standalone: JMP gen_emit_copy_end emit_copy_three_standalone: - MOVB $0x02, BP - LEAQ -4(BP)(DX*4), DX + MOVB $0x02, SI + LEAQ -4(SI)(DX*4), DX MOVB DL, (AX) MOVW CX, 1(AX) ADDQ $0x03, BX @@ -9784,27 +9784,27 @@ gen_emit_copy_end: RET // func matchLen(a []byte, b []byte) int -TEXT ·matchLen(SB), NOSPLIT, $8-56 +TEXT ·matchLen(SB), NOSPLIT, $0-56 MOVQ a_base+0(FP), AX MOVQ b_base+24(FP), CX MOVQ a_len+8(FP), DX - XORQ BP, BP + XORQ SI, SI CMPQ DX, $0x08 JL matchlen_single_standalone matchlen_loopback_standalone: - MOVQ (AX)(BP*1), BX - XORQ (CX)(BP*1), BX + MOVQ (AX)(SI*1), BX + XORQ (CX)(SI*1), BX TESTQ BX, BX JZ matchlen_loop_standalone BSFQ BX, BX SARQ $0x03, BX - LEAQ (BP)(BX*1), BP + LEAQ (SI)(BX*1), SI JMP gen_match_len_end matchlen_loop_standalone: LEAQ -8(DX), DX - LEAQ 8(BP), BP + LEAQ 8(SI), SI CMPQ DX, $0x08 JGE matchlen_loopback_standalone @@ -9813,13 +9813,13 @@ matchlen_single_standalone: JZ gen_match_len_end matchlen_single_loopback_standalone: - MOVB (AX)(BP*1), BL - CMPB (CX)(BP*1), BL + MOVB (AX)(SI*1), BL + CMPB (CX)(SI*1), BL JNE gen_match_len_end - LEAQ 1(BP), BP + LEAQ 1(SI), SI DECQ DX JNZ matchlen_single_loopback_standalone gen_match_len_end: - MOVQ BP, ret+48(FP) + MOVQ SI, ret+48(FP) RET