2426 lines
42 KiB
ArmAsm
2426 lines
42 KiB
ArmAsm
// Code generated by command: go run p256_asm.go -out ../p256_asm_amd64.s. DO NOT EDIT.
|
|
|
|
//go:build !purego
|
|
|
|
#include "textflag.h"
|
|
|
|
// func p256MovCond(res *P256Point, a *P256Point, b *P256Point, cond int)
// Requires: SSE2
//
// Branch-free 96-byte select: res receives the 96 bytes at a when the low
// 32 bits of cond are non-zero, and the 96 bytes at b when they are zero.
// Only the low dword of cond is examined (it is broadcast by PSHUFD $0).
TEXT ·p256MovCond(SB), NOSPLIT, $0-32
	MOVQ res+0(FP), DI
	MOVQ a+8(FP), SI
	MOVQ b+16(FP), CX
	MOVQ cond+24(FP), X12

	// X12 = mask, all ones in every lane iff low32(cond) == 0.
	PXOR X13, X13
	PSHUFD $0, X12, X12
	PCMPEQL X13, X12

	// X0..X5 = a AND NOT mask (a survives when cond != 0).
	MOVOU X12, X0
	MOVOU (SI), X6
	PANDN X6, X0
	MOVOU X12, X1
	MOVOU 16(SI), X7
	PANDN X7, X1
	MOVOU X12, X2
	MOVOU 32(SI), X8
	PANDN X8, X2
	MOVOU X12, X3
	MOVOU 48(SI), X9
	PANDN X9, X3
	MOVOU X12, X4
	MOVOU 64(SI), X10
	PANDN X10, X4
	MOVOU X12, X5
	MOVOU 80(SI), X11
	PANDN X11, X5

	// X6..X11 = b AND mask (b survives when cond == 0).
	MOVOU (CX), X6
	MOVOU 16(CX), X7
	MOVOU 32(CX), X8
	MOVOU 48(CX), X9
	MOVOU 64(CX), X10
	MOVOU 80(CX), X11
	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	// Merge the two masked halves; exactly one of them is non-zero.
	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	MOVOU X0, (DI)
	MOVOU X1, 16(DI)
	MOVOU X2, 32(DI)
	MOVOU X3, 48(DI)
	MOVOU X4, 64(DI)
	MOVOU X5, 80(DI)
	RET
|
|
|
|
// func p256NegCond(val *p256Element, cond int)
// Requires: CMOV
//
// In place, branch-free: when cond != 0 the four limbs at val are replaced
// by (p - val), where p is the constant with limbs
// {-1, p256const0, 0, p256const1}; when cond == 0 val is rewritten unchanged.
// The difference is stored as computed; no further reduction is applied.
TEXT ·p256NegCond(SB), NOSPLIT, $0-16
	MOVQ val+0(FP), DI
	MOVQ cond+8(FP), R14

	// R8..R11 = p
	MOVQ $-1, R8
	MOVQ p256const0<>(SB), R9
	MOVQ $0, R10
	MOVQ p256const1<>(SB), R11

	// R13, SI, CX, R15 = current value
	MOVQ (DI), R13
	MOVQ 8(DI), SI
	MOVQ 16(DI), CX
	MOVQ 24(DI), R15

	// R8..R11 = p - val, computed regardless of cond
	SUBQ R13, R8
	SBBQ SI, R9
	SBBQ CX, R10
	SBBQ R15, R11

	// cond == 0: discard the difference and restore the loaded limbs
	TESTQ R14, R14
	CMOVQEQ R13, R8
	CMOVQEQ SI, R9
	CMOVQEQ CX, R10
	CMOVQEQ R15, R11

	// Write back
	MOVQ R8, (DI)
	MOVQ R9, 8(DI)
	MOVQ R10, 16(DI)
	MOVQ R11, 24(DI)
	RET
|
|
|
|
// Read-only 64-bit constants. Together with the immediates -1 and 0 they are
// used in this file as limbs 1 and 3 of the 256-bit modulus
// {0xffffffffffffffff, p256const0, 0, p256const1} (least significant first).
DATA p256const0<>+0(SB)/8, $0x00000000ffffffff
GLOBL p256const0<>(SB), RODATA, $8

DATA p256const1<>+0(SB)/8, $0xffffffff00000001
GLOBL p256const1<>(SB), RODATA, $8
|
|
|
|
// func p256Sqr(res *p256Element, in *p256Element, n int)
// Requires: CMOV
//
// Performs n successive square-and-reduce passes. The first pass reads its
// operand from in; every later pass reads the value just stored at res.
// The counter is decremented before it is tested, so n is expected to be at
// least 1 (n == 0 would not terminate after a single pass).
//
// Per pass: 512-bit square in R8..R13, CX, SI; four word-wise reduction steps
// using p256const1; one conditional subtraction of the modulus.
TEXT ·p256Sqr(SB), NOSPLIT, $0-24
	MOVQ res+0(FP), DI
	MOVQ in+8(FP), SI
	MOVQ n+16(FP), BX

sqrLoop:
	// Cross products y[1:] * y[0]
	MOVQ (SI), R14
	MOVQ 8(SI), AX
	MULQ R14
	MOVQ AX, R9
	MOVQ DX, R10
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R11
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0, DX
	MOVQ DX, R12

	// Cross products y[2:] * y[1]
	MOVQ 8(SI), R14
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0, DX
	ADDQ AX, R12
	ADCQ $0, DX
	MOVQ DX, R13

	// Cross product y[3] * y[2]
	MOVQ 16(SI), R14
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ AX, R13
	ADCQ $0, DX
	MOVQ DX, CX
	XORQ R15, R15

	// Double the cross products; R15 catches the top carry
	ADDQ R9, R9
	ADCQ R10, R10
	ADCQ R11, R11
	ADCQ R12, R12
	ADCQ R13, R13
	ADCQ CX, CX
	ADCQ $0, R15

	// Add the diagonal squares y[i]*y[i]
	MOVQ (SI), AX
	MULQ AX
	MOVQ AX, R8
	MOVQ DX, R14
	MOVQ 8(SI), AX
	MULQ AX
	ADDQ R14, R9
	ADCQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ 16(SI), AX
	MULQ AX
	ADDQ R14, R11
	ADCQ AX, R12
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ 24(SI), AX
	MULQ AX
	ADDQ R14, R13
	ADCQ AX, CX
	ADCQ DX, R15
	MOVQ R15, SI            // SI is free now: the operand has been fully read

	// Reduction step 1 (eliminates R8)
	MOVQ R8, AX
	MOVQ R8, R15
	SHLQ $32, R8
	MULQ p256const1<>(SB)
	SHRQ $32, R15
	ADDQ R8, R9
	ADCQ R15, R10
	ADCQ AX, R11
	ADCQ $0, DX
	MOVQ DX, R8

	// Reduction step 2 (eliminates R9)
	MOVQ R9, AX
	MOVQ R9, R15
	SHLQ $32, R9
	MULQ p256const1<>(SB)
	SHRQ $32, R15
	ADDQ R9, R10
	ADCQ R15, R11
	ADCQ AX, R8
	ADCQ $0, DX
	MOVQ DX, R9

	// Reduction step 3 (eliminates R10)
	MOVQ R10, AX
	MOVQ R10, R15
	SHLQ $32, R10
	MULQ p256const1<>(SB)
	SHRQ $32, R15
	ADDQ R10, R11
	ADCQ R15, R8
	ADCQ AX, R9
	ADCQ $0, DX
	MOVQ DX, R10

	// Reduction step 4 (eliminates R11); R14 is cleared first so the
	// flags produced by the ADCQ chain below are not disturbed later.
	XORQ R14, R14
	MOVQ R11, AX
	MOVQ R11, R15
	SHLQ $32, R11
	MULQ p256const1<>(SB)
	SHRQ $32, R15
	ADDQ R11, R8
	ADCQ R15, R9
	ADCQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R11

	// Add bits [511:256] of the square. The first ADCQ consumes the carry
	// flag left by the ADCQ $0, DX above (MOVQ does not modify flags).
	ADCQ R12, R8
	ADCQ R13, R9
	ADCQ CX, R10
	ADCQ SI, R11
	ADCQ $0, R14
	MOVQ R8, R12
	MOVQ R9, R13
	MOVQ R10, CX
	MOVQ R11, R15

	// Trial subtraction of the modulus; on borrow keep the saved copy
	SUBQ $-1, R8
	SBBQ p256const0<>(SB), R9
	SBBQ $0, R10
	SBBQ p256const1<>(SB), R11
	SBBQ $0, R14
	CMOVQCS R12, R8
	CMOVQCS R13, R9
	CMOVQCS CX, R10
	CMOVQCS R15, R11

	MOVQ R8, (DI)
	MOVQ R9, 8(DI)
	MOVQ R10, 16(DI)
	MOVQ R11, 24(DI)

	// Next pass squares the value just written
	MOVQ DI, SI
	DECQ BX
	JNE sqrLoop
	RET
|
|
|
|
// func p256Mul(res *p256Element, in1 *p256Element, in2 *p256Element)
// Requires: CMOV
//
// Multiplies the four-limb values at in1 and in2, interleaving one reduction
// step (based on p256const1) after each row of partial products, then applies
// one conditional subtraction of the modulus and stores four limbs at res.
// The accumulator rotates through R8..R13; R14 holds the current in2 limb
// and R15 is scratch.
TEXT ·p256Mul(SB), NOSPLIT, $0-24
	MOVQ res+0(FP), DI
	MOVQ in1+8(FP), SI
	MOVQ in2+16(FP), CX

	// Row 0: x * y[0]
	MOVQ (CX), R14
	MOVQ (SI), AX
	MULQ R14
	MOVQ AX, R8
	MOVQ DX, R9
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, R10
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R11
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0, DX
	MOVQ DX, R12
	XORQ R13, R13

	// Reduction step 1 (eliminates R8, which is then reused as top limb)
	MOVQ R8, AX
	MOVQ R8, R15
	SHLQ $32, R8
	MULQ p256const1<>(SB)
	SHRQ $32, R15
	ADDQ R8, R9
	ADCQ R15, R10
	ADCQ AX, R11
	ADCQ DX, R12
	ADCQ $0, R13
	XORQ R8, R8

	// Row 1: x * y[1]
	MOVQ 8(CX), R14
	MOVQ (SI), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ R15, R10
	ADCQ $0, DX
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0, DX
	ADDQ AX, R11
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0, DX
	ADDQ AX, R12
	ADCQ DX, R13
	ADCQ $0, R8

	// Reduction step 2 (eliminates R9)
	MOVQ R9, AX
	MOVQ R9, R15
	SHLQ $32, R9
	MULQ p256const1<>(SB)
	SHRQ $32, R15
	ADDQ R9, R10
	ADCQ R15, R11
	ADCQ AX, R12
	ADCQ DX, R13
	ADCQ $0, R8
	XORQ R9, R9

	// Row 2: x * y[2]
	MOVQ 16(CX), R14
	MOVQ (SI), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0, DX
	ADDQ AX, R11
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0, DX
	ADDQ AX, R12
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R13
	ADCQ $0, DX
	ADDQ AX, R13
	ADCQ DX, R8
	ADCQ $0, R9

	// Reduction step 3 (eliminates R10)
	MOVQ R10, AX
	MOVQ R10, R15
	SHLQ $32, R10
	MULQ p256const1<>(SB)
	SHRQ $32, R15
	ADDQ R10, R11
	ADCQ R15, R12
	ADCQ AX, R13
	ADCQ DX, R8
	ADCQ $0, R9
	XORQ R10, R10

	// Row 3: x * y[3]
	MOVQ 24(CX), R14
	MOVQ (SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0, DX
	ADDQ AX, R12
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ R15, R13
	ADCQ $0, DX
	ADDQ AX, R13
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R8
	ADCQ $0, DX
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10

	// Reduction step 4 (eliminates R11)
	MOVQ R11, AX
	MOVQ R11, R15
	SHLQ $32, R11
	MULQ p256const1<>(SB)
	SHRQ $32, R15
	ADDQ R11, R12
	ADCQ R15, R13
	ADCQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10

	// Keep a copy of the low 256 bits before the trial subtraction
	MOVQ R12, SI
	MOVQ R13, R11
	MOVQ R8, R14
	MOVQ R9, R15

	// Trial subtraction of the modulus; R10 carries the 257th bit
	SUBQ $-1, R12
	SBBQ p256const0<>(SB), R13
	SBBQ $0, R8
	SBBQ p256const1<>(SB), R9
	SBBQ $0, R10

	// Borrow => the value was already below the modulus; restore the copy
	CMOVQCS SI, R12
	CMOVQCS R11, R13
	CMOVQCS R14, R8
	CMOVQCS R15, R9

	MOVQ R12, (DI)
	MOVQ R13, 8(DI)
	MOVQ R8, 16(DI)
	MOVQ R9, 24(DI)
	RET
|
|
|
|
// func p256FromMont(res *p256Element, in *p256Element)
// Requires: CMOV
//
// Runs the same four word-wise reduction stages used by p256Mul on the
// four limbs at in, with no multiplication rows in between, then performs
// one conditional subtraction of the modulus and stores the result at res.
// (Going by the name, this presumably converts out of Montgomery form.)
TEXT ·p256FromMont(SB), NOSPLIT, $0-16
	MOVQ res+0(FP), DI
	MOVQ in+8(FP), SI
	MOVQ (SI), R8
	MOVQ 8(SI), R9
	MOVQ 16(SI), R10
	MOVQ 24(SI), R11
	XORQ R12, R12

	// Only reductions are needed; there is nothing to multiply.
	// Stage 1 (eliminates R8)
	MOVQ R8, AX
	MOVQ R8, R15
	SHLQ $32, R8
	MULQ p256const1<>(SB)
	SHRQ $32, R15
	ADDQ R8, R9
	ADCQ R15, R10
	ADCQ AX, R11
	ADCQ DX, R12
	XORQ R13, R13

	// Stage 2 (eliminates R9)
	MOVQ R9, AX
	MOVQ R9, R15
	SHLQ $32, R9
	MULQ p256const1<>(SB)
	SHRQ $32, R15
	ADDQ R9, R10
	ADCQ R15, R11
	ADCQ AX, R12
	ADCQ DX, R13
	XORQ R8, R8

	// Stage 3 (eliminates R10)
	MOVQ R10, AX
	MOVQ R10, R15
	SHLQ $32, R10
	MULQ p256const1<>(SB)
	SHRQ $32, R15
	ADDQ R10, R11
	ADCQ R15, R12
	ADCQ AX, R13
	ADCQ DX, R8
	XORQ R9, R9

	// Stage 4 (eliminates R11)
	MOVQ R11, AX
	MOVQ R11, R15
	SHLQ $32, R11
	MULQ p256const1<>(SB)
	SHRQ $32, R15
	ADDQ R11, R12
	ADCQ R15, R13
	ADCQ AX, R8
	ADCQ DX, R9

	// Save the candidate result, then try subtracting the modulus
	MOVQ R12, SI
	MOVQ R13, R11
	MOVQ R8, R14
	MOVQ R9, R15
	SUBQ $-1, R12
	SBBQ p256const0<>(SB), R13
	SBBQ $0, R8
	SBBQ p256const1<>(SB), R9

	// Borrow => keep the unsubtracted value
	CMOVQCS SI, R12
	CMOVQCS R11, R13
	CMOVQCS R14, R8
	CMOVQCS R15, R9

	MOVQ R12, (DI)
	MOVQ R13, 8(DI)
	MOVQ R8, 16(DI)
	MOVQ R9, 24(DI)
	RET
|
|
|
|
// func p256Select(res *P256Point, table *p256Table, idx int)
// Requires: SSE2
//
// Scans 16 consecutive 96-byte table entries and copies to res the one whose
// 1-based position equals the low 32 bits of idx. Every entry is always read
// and masked, so the memory access pattern does not depend on idx.
// If idx matches no position in 1..16 (for example idx == 0), res is zeroed.
TEXT ·p256Select(SB), NOSPLIT, $0-24
	MOVQ idx+16(FP), AX
	MOVQ table+8(FP), DI
	MOVQ res+0(FP), DX

	// X15 = 1 in every lane (0 - (-1)); X14 = idx broadcast
	PXOR X15, X15
	PCMPEQL X14, X14
	PSUBL X14, X15
	MOVL AX, X14
	PSHUFD $0, X14, X14

	// X0..X5 accumulate the selected entry
	PXOR X0, X0
	PXOR X1, X1
	PXOR X2, X2
	PXOR X3, X3
	PXOR X4, X4
	PXOR X5, X5

	MOVQ $16, AX            // entries left to scan
	MOVOU X15, X13          // running position counter, starts at 1

loop_select:
	// X12 = all ones iff current position == idx; then advance the counter
	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12

	MOVOU (DI), X6
	MOVOU 16(DI), X7
	MOVOU 32(DI), X8
	MOVOU 48(DI), X9
	MOVOU 64(DI), X10
	MOVOU 80(DI), X11
	ADDQ $96, DI

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	DECQ AX
	JNE loop_select

	MOVOU X0, (DX)
	MOVOU X1, 16(DX)
	MOVOU X2, 32(DX)
	MOVOU X3, 48(DX)
	MOVOU X4, 64(DX)
	MOVOU X5, 80(DX)
	RET
|
|
|
|
// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
// Requires: SSE2
//
// Scans 32 consecutive 64-byte table entries (two per loop iteration, 16
// iterations) and copies to res the one whose 1-based position equals the
// low 32 bits of idx. Every entry is read and masked regardless of idx.
// If idx matches no position in 1..32, res is zeroed.
TEXT ·p256SelectAffine(SB), NOSPLIT, $0-24
	MOVQ idx+16(FP), AX
	MOVQ table+8(FP), DI
	MOVQ res+0(FP), DX

	// X15 = 1 in every lane (0 - (-1)); X14 = idx broadcast
	PXOR X15, X15
	PCMPEQL X14, X14
	PSUBL X14, X15
	MOVL AX, X14
	PSHUFD $0, X14, X14

	// X0..X3 accumulate the selected entry
	PXOR X0, X0
	PXOR X1, X1
	PXOR X2, X2
	PXOR X3, X3

	MOVQ $16, AX            // loop iterations (two entries each)
	MOVOU X15, X13          // running position counter, starts at 1

loop_select_base:
	// Mask for the first entry of the pair
	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12

	// Load both entries (X4..X7 and X8..X11)
	MOVOU (DI), X4
	MOVOU 16(DI), X5
	MOVOU 32(DI), X6
	MOVOU 48(DI), X7
	MOVOU 64(DI), X8
	MOVOU 80(DI), X9
	MOVOU 96(DI), X10
	MOVOU 112(DI), X11
	ADDQ $128, DI

	PAND X12, X4
	PAND X12, X5
	PAND X12, X6
	PAND X12, X7

	// Mask for the second entry of the pair
	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12

	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X4, X0
	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3
	PXOR X8, X0
	PXOR X9, X1
	PXOR X10, X2
	PXOR X11, X3

	DECQ AX
	JNE loop_select_base

	MOVOU X0, (DX)
	MOVOU X1, 16(DX)
	MOVOU X2, 32(DX)
	MOVOU X3, 48(DX)
	RET
|
|
|
|
// func p256OrdMul(res *p256OrdElement, in1 *p256OrdElement, in2 *p256OrdElement)
// Requires: CMOV
//
// Same row-by-row multiply/reduce structure as p256Mul, but the modulus is
// the four-limb constant p256ord. Each reduction step derives its multiplier
// as low64(acc * p256ordK0) and adds multiplier * p256ord, which clears the
// lowest live accumulator limb. One conditional subtraction of p256ord
// finishes the computation. Accumulator: R8..R13 (rotating); R14 holds the
// current multiplier; R15 carries the running high word.
TEXT ·p256OrdMul(SB), NOSPLIT, $0-24
	MOVQ res+0(FP), DI
	MOVQ in1+8(FP), SI
	MOVQ in2+16(FP), CX

	// Row 0: x * y[0]
	MOVQ (CX), R14
	MOVQ (SI), AX
	MULQ R14
	MOVQ AX, R8
	MOVQ DX, R9
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, R10
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R11
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0, DX
	MOVQ DX, R12
	XORQ R13, R13

	// Reduction step 1 (clears R8)
	MOVQ R8, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R8
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R9
	ADCQ $0, DX
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ p256ord<>+16(SB), AX
	MULQ R14
	ADDQ R15, R10
	ADCQ $0, DX
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ p256ord<>+24(SB), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0, DX
	ADDQ AX, R11
	ADCQ DX, R12
	ADCQ $0, R13

	// Row 1: x * y[1]
	MOVQ 8(CX), R14
	MOVQ (SI), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ R15, R10
	ADCQ $0, DX
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0, DX
	ADDQ AX, R11
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0, DX
	ADDQ AX, R12
	ADCQ DX, R13
	ADCQ $0, R8

	// Reduction step 2 (clears R9)
	MOVQ R9, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R10
	ADCQ $0, DX
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ p256ord<>+16(SB), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0, DX
	ADDQ AX, R11
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ p256ord<>+24(SB), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0, DX
	ADDQ AX, R12
	ADCQ DX, R13
	ADCQ $0, R8

	// Row 2: x * y[2]
	MOVQ 16(CX), R14
	MOVQ (SI), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0, DX
	ADDQ AX, R11
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0, DX
	ADDQ AX, R12
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R13
	ADCQ $0, DX
	ADDQ AX, R13
	ADCQ DX, R8
	ADCQ $0, R9

	// Reduction step 3 (clears R10)
	MOVQ R10, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0, DX
	ADDQ AX, R11
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ p256ord<>+16(SB), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0, DX
	ADDQ AX, R12
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ p256ord<>+24(SB), AX
	MULQ R14
	ADDQ R15, R13
	ADCQ $0, DX
	ADDQ AX, R13
	ADCQ DX, R8
	ADCQ $0, R9

	// Row 3: x * y[3]
	MOVQ 24(CX), R14
	MOVQ (SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0, DX
	ADDQ AX, R12
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ R15, R13
	ADCQ $0, DX
	ADDQ AX, R13
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R8
	ADCQ $0, DX
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10

	// Reduction step 4 (clears R11)
	MOVQ R11, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0, DX
	ADDQ AX, R12
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ p256ord<>+16(SB), AX
	MULQ R14
	ADDQ R15, R13
	ADCQ $0, DX
	ADDQ AX, R13
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ p256ord<>+24(SB), AX
	MULQ R14
	ADDQ R15, R8
	ADCQ $0, DX
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10

	// Keep a copy of the low 256 bits before the trial subtraction
	MOVQ R12, SI
	MOVQ R13, R11
	MOVQ R8, R14
	MOVQ R9, R15

	// Trial subtraction of p256ord; R10 carries the 257th bit
	SUBQ p256ord<>+0(SB), R12
	SBBQ p256ord<>+8(SB), R13
	SBBQ p256ord<>+16(SB), R8
	SBBQ p256ord<>+24(SB), R9
	SBBQ $0, R10

	// Borrow => restore the unsubtracted copy
	CMOVQCS SI, R12
	CMOVQCS R11, R13
	CMOVQCS R14, R8
	CMOVQCS R15, R9

	MOVQ R12, (DI)
	MOVQ R13, 8(DI)
	MOVQ R8, 16(DI)
	MOVQ R9, 24(DI)
	RET
|
|
|
|
// p256ordK0 is the 64-bit factor the p256Ord* reduction steps multiply the
// lowest accumulator limb by to obtain the per-step multiplier.
DATA p256ordK0<>+0(SB)/8, $0xccd1c8aaee00bc4f
GLOBL p256ordK0<>(SB), RODATA, $8

// p256ord is the 256-bit modulus of the p256Ord* routines, stored as four
// 64-bit limbs, least significant first.
DATA p256ord<>+0(SB)/8, $0xf3b9cac2fc632551
DATA p256ord<>+8(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+16(SB)/8, $0xffffffffffffffff
DATA p256ord<>+24(SB)/8, $0xffffffff00000000
GLOBL p256ord<>(SB), RODATA, $32
|
|
|
|
// func p256OrdSqr(res *p256OrdElement, in *p256OrdElement, n int)
// Requires: CMOV
//
// Performs n successive square-and-reduce passes modulo p256ord. The first
// pass reads in; later passes read the value just stored at res. The counter
// is decremented before it is tested, so n is expected to be at least 1.
//
// Each reduction step multiplies explicitly only by the two low limbs of
// p256ord. The two high limbs are 2^64-1 and 2^64-2^32, so their products
// with the multiplier t are formed as (t<<64 - t) and (t<<64 - t<<32) using
// shifts, additions and subtractions.
TEXT ·p256OrdSqr(SB), NOSPLIT, $0-24
	MOVQ res+0(FP), DI
	MOVQ in+8(FP), SI
	MOVQ n+16(FP), BX

ordSqrLoop:
	// Cross products y[1:] * y[0]
	MOVQ (SI), R14
	MOVQ 8(SI), AX
	MULQ R14
	MOVQ AX, R9
	MOVQ DX, R10
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R11
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0, DX
	MOVQ DX, R12

	// Cross products y[2:] * y[1]
	MOVQ 8(SI), R14
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0, DX
	ADDQ AX, R12
	ADCQ $0, DX
	MOVQ DX, R13

	// Cross product y[3] * y[2]
	MOVQ 16(SI), R14
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ AX, R13
	ADCQ $0, DX
	MOVQ DX, CX
	XORQ R15, R15

	// Double the cross products; R15 catches the top carry
	ADDQ R9, R9
	ADCQ R10, R10
	ADCQ R11, R11
	ADCQ R12, R12
	ADCQ R13, R13
	ADCQ CX, CX
	ADCQ $0, R15

	// Add the diagonal squares y[i]*y[i]
	MOVQ (SI), AX
	MULQ AX
	MOVQ AX, R8
	MOVQ DX, R14
	MOVQ 8(SI), AX
	MULQ AX
	ADDQ R14, R9
	ADCQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ 16(SI), AX
	MULQ AX
	ADDQ R14, R11
	ADCQ AX, R12
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ 24(SI), AX
	MULQ AX
	ADDQ R14, R13
	ADCQ AX, CX
	ADCQ DX, R15
	MOVQ R15, SI            // SI is free now: the operand has been fully read

	// Reduction step 1 (multiplier from R8; R8 becomes the new top limb)
	MOVQ R8, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R8
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R9
	ADCQ $0, DX
	ADDQ AX, R9
	MOVQ R14, R15
	ADCQ DX, R10
	ADCQ $0, R15
	SUBQ R14, R10
	SBBQ $0, R15
	MOVQ R14, AX
	MOVQ R14, DX
	MOVQ R14, R8
	SHLQ $32, AX
	SHRQ $32, DX
	ADDQ R15, R11
	ADCQ $0, R8
	SUBQ AX, R11
	SBBQ DX, R8

	// Reduction step 2 (multiplier from R9)
	MOVQ R9, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R10
	ADCQ $0, DX
	ADDQ AX, R10
	MOVQ R14, R15
	ADCQ DX, R11
	ADCQ $0, R15
	SUBQ R14, R11
	SBBQ $0, R15
	MOVQ R14, AX
	MOVQ R14, DX
	MOVQ R14, R9
	SHLQ $32, AX
	SHRQ $32, DX
	ADDQ R15, R8
	ADCQ $0, R9
	SUBQ AX, R8
	SBBQ DX, R9

	// Reduction step 3 (multiplier from R10)
	MOVQ R10, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0, DX
	ADDQ AX, R11
	MOVQ R14, R15
	ADCQ DX, R8
	ADCQ $0, R15
	SUBQ R14, R8
	SBBQ $0, R15
	MOVQ R14, AX
	MOVQ R14, DX
	MOVQ R14, R10
	SHLQ $32, AX
	SHRQ $32, DX
	ADDQ R15, R9
	ADCQ $0, R10
	SUBQ AX, R9
	SBBQ DX, R10

	// Reduction step 4 (multiplier from R11). Unlike steps 1-3 this one
	// folds the carry into DX before the ADCQ DX, R9 below.
	MOVQ R11, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R8
	ADCQ $0, DX
	ADDQ AX, R8
	ADCQ $0, DX
	MOVQ DX, R15            // NOTE(review): overwritten by the next instruction
	MOVQ R14, R15
	ADCQ DX, R9
	ADCQ $0, R15
	SUBQ R14, R9
	SBBQ $0, R15
	MOVQ R14, AX
	MOVQ R14, DX
	MOVQ R14, R11
	SHLQ $32, AX
	SHRQ $32, DX
	ADDQ R15, R10
	ADCQ $0, R11
	SUBQ AX, R10
	SBBQ DX, R11
	XORQ R14, R14           // also clears CF for the chain below

	// Add bits [511:256] of the square
	ADCQ R12, R8
	ADCQ R13, R9
	ADCQ CX, R10
	ADCQ SI, R11
	ADCQ $0, R14
	MOVQ R8, R12
	MOVQ R9, R13
	MOVQ R10, CX
	MOVQ R11, R15

	// Trial subtraction of p256ord; on borrow keep the saved copy
	SUBQ p256ord<>+0(SB), R8
	SBBQ p256ord<>+8(SB), R9
	SBBQ p256ord<>+16(SB), R10
	SBBQ p256ord<>+24(SB), R11
	SBBQ $0, R14
	CMOVQCS R12, R8
	CMOVQCS R13, R9
	CMOVQCS CX, R10
	CMOVQCS R15, R11

	MOVQ R8, (DI)
	MOVQ R9, 8(DI)
	MOVQ R10, 16(DI)
	MOVQ R11, 24(DI)

	// Next pass squares the value just written
	MOVQ DI, SI
	DECQ BX
	JNE ordSqrLoop
	RET
|
|
|
|
// func p256SubInternal()
// Requires: CMOV
//
// Register-based helper (no Go arguments).
//   In:  minuend    R10, R11, R12, R13   (least significant limb first)
//        subtrahend R14, R15, DI,  SI
//   Out: R10..R13 = minuend - subtrahend, with the modulus added back once
//        when the subtraction borrowed.
//   Scratch: AX, BX, CX, R8, R9, flags. R14, R15, DI, SI are not modified.
TEXT p256SubInternal<>(SB), NOSPLIT, $0
	// Raw difference; AX becomes 0 or all ones depending on the borrow
	XORQ AX, AX
	SUBQ R14, R10
	SBBQ R15, R11
	SBBQ DI, R12
	SBBQ SI, R13
	SBBQ $0, AX

	// Save the raw difference
	MOVQ R10, BX
	MOVQ R11, CX
	MOVQ R12, R8
	MOVQ R13, R9

	// Speculatively add the modulus back
	ADDQ $-1, R10
	ADCQ p256const0<>(SB), R11
	ADCQ $0, R12
	ADCQ p256const1<>(SB), R13

	// ZF set (no borrow happened) => keep the raw difference
	ANDQ $1, AX
	CMOVQEQ BX, R10
	CMOVQEQ CX, R11
	CMOVQEQ R8, R12
	CMOVQEQ R9, R13
	RET
|
|
|
|
// func p256MulInternal()
// Requires: CMOV
//
// Register-based helper (no Go arguments).
//   In:  first operand  R10, R11, R12, R13   (least significant limb first)
//        second operand R14, R15, DI,  SI
//   Out: R10..R13 = product after four reduction steps and one conditional
//        subtraction of the modulus.
//   Scratch: AX, BX, CX, DX, R8, R9, BP, flags.
//   R14, R15, DI, SI are only read, so the second operand survives the call.
// NOTE(review): the 8-byte frame is presumably declared because BP is used
// as a scratch register here — confirm against the Go assembler frame rules.
TEXT p256MulInternal<>(SB), NOSPLIT, $8
	// Row 0: first[0] * second -> BX, CX, R8, R9, R10
	MOVQ R10, AX
	MULQ R14
	MOVQ AX, BX
	MOVQ DX, CX
	MOVQ R10, AX
	MULQ R15
	ADDQ AX, CX
	ADCQ $0, DX
	MOVQ DX, R8
	MOVQ R10, AX
	MULQ DI
	ADDQ AX, R8
	ADCQ $0, DX
	MOVQ DX, R9
	MOVQ R10, AX
	MULQ SI
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, R10

	// Row 1: first[1] * second, accumulated from CX upward; top in R11
	MOVQ R11, AX
	MULQ R14
	ADDQ AX, CX
	ADCQ $0, DX
	MOVQ DX, BP
	MOVQ R11, AX
	MULQ R15
	ADDQ BP, R8
	ADCQ $0, DX
	ADDQ AX, R8
	ADCQ $0, DX
	MOVQ DX, BP
	MOVQ R11, AX
	MULQ DI
	ADDQ BP, R9
	ADCQ $0, DX
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, BP
	MOVQ R11, AX
	MULQ SI
	ADDQ BP, R10
	ADCQ $0, DX
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R11

	// Row 2: first[2] * second, accumulated from R8 upward; top in R12
	MOVQ R12, AX
	MULQ R14
	ADDQ AX, R8
	ADCQ $0, DX
	MOVQ DX, BP
	MOVQ R12, AX
	MULQ R15
	ADDQ BP, R9
	ADCQ $0, DX
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, BP
	MOVQ R12, AX
	MULQ DI
	ADDQ BP, R10
	ADCQ $0, DX
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, BP
	MOVQ R12, AX
	MULQ SI
	ADDQ BP, R11
	ADCQ $0, DX
	ADDQ AX, R11
	ADCQ $0, DX
	MOVQ DX, R12

	// Row 3: first[3] * second, accumulated from R9 upward; top in R13
	MOVQ R13, AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, BP
	MOVQ R13, AX
	MULQ R15
	ADDQ BP, R10
	ADCQ $0, DX
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, BP
	MOVQ R13, AX
	MULQ DI
	ADDQ BP, R11
	ADCQ $0, DX
	ADDQ AX, R11
	ADCQ $0, DX
	MOVQ DX, BP
	MOVQ R13, AX
	MULQ SI
	ADDQ BP, R12
	ADCQ $0, DX
	ADDQ AX, R12
	ADCQ $0, DX
	MOVQ DX, R13

	// Reduction step 1 (eliminates BX)
	MOVQ BX, AX
	MOVQ BX, BP
	SHLQ $32, BX
	MULQ p256const1<>(SB)
	SHRQ $32, BP
	ADDQ BX, CX
	ADCQ BP, R8
	ADCQ AX, R9
	ADCQ $0, DX
	MOVQ DX, BX

	// Reduction step 2 (eliminates CX)
	MOVQ CX, AX
	MOVQ CX, BP
	SHLQ $32, CX
	MULQ p256const1<>(SB)
	SHRQ $32, BP
	ADDQ CX, R8
	ADCQ BP, R9
	ADCQ AX, BX
	ADCQ $0, DX
	MOVQ DX, CX

	// Reduction step 3 (eliminates R8)
	MOVQ R8, AX
	MOVQ R8, BP
	SHLQ $32, R8
	MULQ p256const1<>(SB)
	SHRQ $32, BP
	ADDQ R8, R9
	ADCQ BP, BX
	ADCQ AX, CX
	ADCQ $0, DX
	MOVQ DX, R8

	// Reduction step 4 (eliminates R9). BP is zeroed with MOVQ, which
	// leaves the flags untouched for the ADCQ chain that follows.
	MOVQ R9, AX
	MOVQ R9, BP
	SHLQ $32, R9
	MULQ p256const1<>(SB)
	SHRQ $32, BP
	ADDQ R9, BX
	ADCQ BP, CX
	ADCQ AX, R8
	ADCQ $0, DX
	MOVQ DX, R9
	MOVQ $0, BP

	// Add the reduced low half to bits [511:256] of the product
	ADCQ BX, R10
	ADCQ CX, R11
	ADCQ R8, R12
	ADCQ R9, R13
	ADCQ $0, BP

	// Save the candidate result
	MOVQ R10, BX
	MOVQ R11, CX
	MOVQ R12, R8
	MOVQ R13, R9

	// Trial subtraction of the modulus; BP carries the 257th bit
	SUBQ $-1, R10
	SBBQ p256const0<>(SB), R11
	SBBQ $0, R12
	SBBQ p256const1<>(SB), R13
	SBBQ $0, BP

	// Borrow => the subtraction went negative; restore the saved value
	CMOVQCS BX, R10
	CMOVQCS CX, R11
	CMOVQCS R8, R12
	CMOVQCS R9, R13
	RET
|
|
|
|
// func p256SqrInternal()
// Requires: CMOV
//
// Register-based helper (no Go arguments).
//   In:  operand R10, R11, R12, R13   (least significant limb first)
//   Out: R10..R13 = square after four reduction steps and one conditional
//        subtraction of the modulus.
//   Scratch: AX, BX, CX, DX, R8, R9, R14, R15, DI, SI, BP, flags.
// NOTE(review): the 8-byte frame is presumably declared because BP is used
// as a scratch register here — confirm against the Go assembler frame rules.
TEXT p256SqrInternal<>(SB), NOSPLIT, $8
	// Cross products x[i]*x[j], i < j -> CX, R8, R9, R14, R15, DI
	MOVQ R10, AX
	MULQ R11
	MOVQ AX, CX
	MOVQ DX, R8
	MOVQ R10, AX
	MULQ R12
	ADDQ AX, R8
	ADCQ $0, DX
	MOVQ DX, R9
	MOVQ R10, AX
	MULQ R13
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ R11, AX
	MULQ R12
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, BP
	MOVQ R11, AX
	MULQ R13
	ADDQ BP, R14
	ADCQ $0, DX
	ADDQ AX, R14
	ADCQ $0, DX
	MOVQ DX, R15
	MOVQ R12, AX
	MULQ R13
	ADDQ AX, R15
	ADCQ $0, DX
	MOVQ DX, DI
	XORQ SI, SI

	// Double the cross products; SI catches the top carry
	ADDQ CX, CX
	ADCQ R8, R8
	ADCQ R9, R9
	ADCQ R14, R14
	ADCQ R15, R15
	ADCQ DI, DI
	ADCQ $0, SI

	// Add the diagonal squares x[i]*x[i]; R10 is reused as carry word
	MOVQ R10, AX
	MULQ AX
	MOVQ AX, BX
	MOVQ DX, R10
	MOVQ R11, AX
	MULQ AX
	ADDQ R10, CX
	ADCQ AX, R8
	ADCQ $0, DX
	MOVQ DX, R10
	MOVQ R12, AX
	MULQ AX
	ADDQ R10, R9
	ADCQ AX, R14
	ADCQ $0, DX
	MOVQ DX, R10
	MOVQ R13, AX
	MULQ AX
	ADDQ R10, R15
	ADCQ AX, DI
	ADCQ DX, SI

	// Reduction step 1 (eliminates BX)
	MOVQ BX, AX
	MOVQ BX, BP
	SHLQ $32, BX
	MULQ p256const1<>(SB)
	SHRQ $32, BP
	ADDQ BX, CX
	ADCQ BP, R8
	ADCQ AX, R9
	ADCQ $0, DX
	MOVQ DX, BX

	// Reduction step 2 (eliminates CX)
	MOVQ CX, AX
	MOVQ CX, BP
	SHLQ $32, CX
	MULQ p256const1<>(SB)
	SHRQ $32, BP
	ADDQ CX, R8
	ADCQ BP, R9
	ADCQ AX, BX
	ADCQ $0, DX
	MOVQ DX, CX

	// Reduction step 3 (eliminates R8)
	MOVQ R8, AX
	MOVQ R8, BP
	SHLQ $32, R8
	MULQ p256const1<>(SB)
	SHRQ $32, BP
	ADDQ R8, R9
	ADCQ BP, BX
	ADCQ AX, CX
	ADCQ $0, DX
	MOVQ DX, R8

	// Reduction step 4 (eliminates R9). BP is zeroed with MOVQ, which
	// leaves the flags untouched for the ADCQ chain that follows.
	MOVQ R9, AX
	MOVQ R9, BP
	SHLQ $32, R9
	MULQ p256const1<>(SB)
	SHRQ $32, BP
	ADDQ R9, BX
	ADCQ BP, CX
	ADCQ AX, R8
	ADCQ $0, DX
	MOVQ DX, R9
	MOVQ $0, BP

	// Add the reduced low half to bits [511:256] of the square
	ADCQ BX, R14
	ADCQ CX, R15
	ADCQ R8, DI
	ADCQ R9, SI
	ADCQ $0, BP

	// Move the candidate into the output registers; R14, R15, DI, SI
	// keep the unsubtracted copy
	MOVQ R14, R10
	MOVQ R15, R11
	MOVQ DI, R12
	MOVQ SI, R13

	// Trial subtraction of the modulus; BP carries the 257th bit
	SUBQ $-1, R10
	SBBQ p256const0<>(SB), R11
	SBBQ $0, R12
	SBBQ p256const1<>(SB), R13
	SBBQ $0, BP

	// Borrow => the subtraction went negative; restore the saved value
	CMOVQCS R14, R10
	CMOVQCS R15, R11
	CMOVQCS DI, R12
	CMOVQCS SI, R13
	RET
|
|
|
|
// func p256PointAddAffineAsm(res *P256Point, in1 *P256Point, in2 *p256AffinePoint, sign int, sel int, zero int)
// Requires: CMOV, SSE2
//
// Adds the 64-byte affine value at in2 to the 96-byte value at in1 and
// writes 96 bytes to res, using p256SqrInternal / p256MulInternal /
// p256SubInternal for all field arithmetic.
//   sign != 0 : the second 32-byte half of in2 is replaced by its negation
//               modulo the field prime before use.
//   sel  == 0 : (low 32 bits) the computed sum is discarded and in1 is
//               written to res.
//   zero == 0 : (low 32 bits) res receives in2's first 32 bytes, the
//               sign-adjusted second half, and p256one; this blend is applied
//               last, so it takes precedence over sel.
//
// The names x1/y1/z1 and x2/y2 in the comments assume P256Point is three
// consecutive 32-byte coordinates (x, y, z) and p256AffinePoint is two
// (x, y) — TODO confirm against the Go declarations.
//
// Stack frame (512 bytes), 32-byte slots unless noted:
//     0 x1    32 y1    64 z1    96 x2   128 y2 (sign-adjusted)
//   160 xout 192 yout 224 zout 256 s2   288 z1sqr
//   320 h    352 r    384 hsqr 416 rsqr 448 hcub
//   480 saved res pointer (8)  488 sel (4)  492 zero (4)
TEXT ·p256PointAddAffineAsm(SB), $512-48
	MOVQ res+0(FP), AX
	MOVQ in1+8(FP), BX
	MOVQ in2+16(FP), CX
	MOVQ sign+24(FP), DX
	MOVQ sel+32(FP), R15
	MOVQ zero+40(FP), DI

	// Copy in1 (96 bytes) to 0(SP)
	MOVOU (BX), X0
	MOVOU 16(BX), X1
	MOVOU 32(BX), X2
	MOVOU 48(BX), X3
	MOVOU 64(BX), X4
	MOVOU 80(BX), X5
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)
	MOVOU X4, 64(SP)
	MOVOU X5, 80(SP)

	// Copy the first half of in2 (x2) to 96(SP)
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU X0, 96(SP)
	MOVOU X1, 112(SP)

	// Park the result pointer and the low dwords of sel and zero
	MOVQ AX, 480(SP)
	MOVL R15, 488(SP)
	MOVL DI, 492(SP)

	// Load y2 and prepare the prime in BX, CX, R8, R9
	MOVQ 32(CX), R10
	MOVQ 40(CX), R11
	MOVQ 48(CX), R12
	MOVQ 56(CX), R13
	MOVQ $-1, BX
	MOVQ p256const0<>(SB), CX
	MOVQ $0, R8
	MOVQ p256const1<>(SB), R9
	XORQ AX, AX

	// Speculatively compute p - y2; AX records the borrow
	SUBQ R10, BX
	SBBQ R11, CX
	SBBQ R12, R8
	SBBQ R13, R9
	SBBQ $0, AX
	MOVQ BX, R14
	MOVQ CX, R15
	MOVQ R8, DI
	MOVQ R9, SI

	// Add p once more in case the operand was larger than p; keep the
	// first difference when the final ADCQ leaves AX non-zero
	ADDQ $-1, BX
	ADCQ p256const0<>(SB), CX
	ADCQ $0, R8
	ADCQ p256const1<>(SB), R9
	ADCQ $0, AX
	CMOVQNE R14, BX
	CMOVQNE R15, CX
	CMOVQNE DI, R8
	CMOVQNE SI, R9

	// sign == 0: keep the original y2
	TESTQ DX, DX
	CMOVQEQ R10, BX
	CMOVQEQ R11, CX
	CMOVQEQ R12, R8
	CMOVQEQ R13, R9

	// y2 slot
	MOVQ BX, 128(SP)
	MOVQ CX, 136(SP)
	MOVQ R8, 144(SP)
	MOVQ R9, 152(SP)

	// ---- Point addition ----
	// z1sqr = z1^2
	MOVQ 64(SP), R10
	MOVQ 72(SP), R11
	MOVQ 80(SP), R12
	MOVQ 88(SP), R13
	CALL p256SqrInternal<>(SB)
	MOVQ R10, 288(SP)
	MOVQ R11, 296(SP)
	MOVQ R12, 304(SP)
	MOVQ R13, 312(SP)

	// acc = x2 * z1^2
	MOVQ 96(SP), R14
	MOVQ 104(SP), R15
	MOVQ 112(SP), DI
	MOVQ 120(SP), SI
	CALL p256MulInternal<>(SB)

	// h = acc - x1
	MOVQ (SP), R14
	MOVQ 8(SP), R15
	MOVQ 16(SP), DI
	MOVQ 24(SP), SI
	CALL p256SubInternal<>(SB)
	MOVQ R10, 320(SP)
	MOVQ R11, 328(SP)
	MOVQ R12, 336(SP)
	MOVQ R13, 344(SP)

	// zout = h * z1
	MOVQ 64(SP), R14
	MOVQ 72(SP), R15
	MOVQ 80(SP), DI
	MOVQ 88(SP), SI
	CALL p256MulInternal<>(SB)
	MOVQ R10, 224(SP)
	MOVQ R11, 232(SP)
	MOVQ R12, 240(SP)
	MOVQ R13, 248(SP)

	// acc = z1sqr * z1 (z1 is still in R14, R15, DI, SI)
	MOVQ 288(SP), R10
	MOVQ 296(SP), R11
	MOVQ 304(SP), R12
	MOVQ 312(SP), R13
	CALL p256MulInternal<>(SB)

	// s2 = acc * y2
	MOVQ 128(SP), R14
	MOVQ 136(SP), R15
	MOVQ 144(SP), DI
	MOVQ 152(SP), SI
	CALL p256MulInternal<>(SB)
	MOVQ R10, 256(SP)
	MOVQ R11, 264(SP)
	MOVQ R12, 272(SP)
	MOVQ R13, 280(SP)

	// r = s2 - y1
	MOVQ 32(SP), R14
	MOVQ 40(SP), R15
	MOVQ 48(SP), DI
	MOVQ 56(SP), SI
	CALL p256SubInternal<>(SB)
	MOVQ R10, 352(SP)
	MOVQ R11, 360(SP)
	MOVQ R12, 368(SP)
	MOVQ R13, 376(SP)

	// rsqr = r^2
	CALL p256SqrInternal<>(SB)
	MOVQ R10, 416(SP)
	MOVQ R11, 424(SP)
	MOVQ R12, 432(SP)
	MOVQ R13, 440(SP)

	// hsqr = h^2
	MOVQ 320(SP), R10
	MOVQ 328(SP), R11
	MOVQ 336(SP), R12
	MOVQ 344(SP), R13
	CALL p256SqrInternal<>(SB)
	MOVQ R10, 384(SP)
	MOVQ R11, 392(SP)
	MOVQ R12, 400(SP)
	MOVQ R13, 408(SP)

	// hcub = hsqr * h
	MOVQ 320(SP), R14
	MOVQ 328(SP), R15
	MOVQ 336(SP), DI
	MOVQ 344(SP), SI
	CALL p256MulInternal<>(SB)
	MOVQ R10, 448(SP)
	MOVQ R11, 456(SP)
	MOVQ R12, 464(SP)
	MOVQ R13, 472(SP)

	// s2 slot = hcub * y1
	MOVQ 32(SP), R14
	MOVQ 40(SP), R15
	MOVQ 48(SP), DI
	MOVQ 56(SP), SI
	CALL p256MulInternal<>(SB)
	MOVQ R10, 256(SP)
	MOVQ R11, 264(SP)
	MOVQ R12, 272(SP)
	MOVQ R13, 280(SP)

	// h slot = x1 * hsqr
	MOVQ (SP), R10
	MOVQ 8(SP), R11
	MOVQ 16(SP), R12
	MOVQ 24(SP), R13
	MOVQ 384(SP), R14
	MOVQ 392(SP), R15
	MOVQ 400(SP), DI
	MOVQ 408(SP), SI
	CALL p256MulInternal<>(SB)
	MOVQ R10, 320(SP)
	MOVQ R11, 328(SP)
	MOVQ R12, 336(SP)
	MOVQ R13, 344(SP)

	// t = 2 * (x1 * hsqr), reduced once, left in R14, R15, DI, SI
	XORQ AX, AX
	ADDQ R10, R10
	ADCQ R11, R11
	ADCQ R12, R12
	ADCQ R13, R13
	ADCQ $0, AX
	MOVQ R10, R14
	MOVQ R11, R15
	MOVQ R12, DI
	MOVQ R13, SI
	SUBQ $-1, R14
	SBBQ p256const0<>(SB), R15
	SBBQ $0, DI
	SBBQ p256const1<>(SB), SI
	SBBQ $0, AX
	CMOVQCS R10, R14
	CMOVQCS R11, R15
	CMOVQCS R12, DI
	CMOVQCS R13, SI

	// acc = rsqr - t
	MOVQ 416(SP), R10
	MOVQ 424(SP), R11
	MOVQ 432(SP), R12
	MOVQ 440(SP), R13
	CALL p256SubInternal<>(SB)

	// xout = acc - hcub
	MOVQ 448(SP), R14
	MOVQ 456(SP), R15
	MOVQ 464(SP), DI
	MOVQ 472(SP), SI
	CALL p256SubInternal<>(SB)
	MOVQ R10, 160(SP)
	MOVQ R11, 168(SP)
	MOVQ R12, 176(SP)
	MOVQ R13, 184(SP)

	// acc = (x1 * hsqr) - xout
	MOVQ R10, R14
	MOVQ R11, R15
	MOVQ R12, DI
	MOVQ R13, SI
	MOVQ 320(SP), R10
	MOVQ 328(SP), R11
	MOVQ 336(SP), R12
	MOVQ 344(SP), R13
	CALL p256SubInternal<>(SB)

	// acc = acc * r
	MOVQ 352(SP), R14
	MOVQ 360(SP), R15
	MOVQ 368(SP), DI
	MOVQ 376(SP), SI
	CALL p256MulInternal<>(SB)

	// yout = acc - (hcub * y1)
	MOVQ 256(SP), R14
	MOVQ 264(SP), R15
	MOVQ 272(SP), DI
	MOVQ 280(SP), SI
	CALL p256SubInternal<>(SB)
	MOVQ R10, 192(SP)
	MOVQ R11, 200(SP)
	MOVQ R12, 208(SP)
	MOVQ R13, 216(SP)

	// Reload the parked result pointer, sel and zero
	MOVQ 480(SP), AX
	MOVL 488(SP), BX
	MOVL 492(SP), CX

	// ---- Blend 1: if sel == 0 the computed sum is replaced by in1 ----
	MOVOU 160(SP), X0
	MOVOU 176(SP), X1
	MOVOU 192(SP), X2
	MOVOU 208(SP), X3
	MOVOU 224(SP), X4
	MOVOU 240(SP), X5

	// X6 = (sel == 0) mask, X7 = (zero == 0) mask, X15 = NOT X6
	MOVL BX, X6
	MOVL CX, X7
	PXOR X8, X8
	PCMPEQL X9, X9
	PSHUFD $0, X6, X6
	PSHUFD $0, X7, X7
	PCMPEQL X8, X6
	PCMPEQL X8, X7
	MOVOU X6, X15
	PANDN X9, X15

	MOVOU (SP), X9
	MOVOU 16(SP), X10
	MOVOU 32(SP), X11
	MOVOU 48(SP), X12
	MOVOU 64(SP), X13
	MOVOU 80(SP), X14

	PAND X15, X0
	PAND X15, X1
	PAND X15, X2
	PAND X15, X3
	PAND X15, X4
	PAND X15, X5

	PAND X6, X9
	PAND X6, X10
	PAND X6, X11
	PAND X6, X12
	PAND X6, X13
	PAND X6, X14

	PXOR X9, X0
	PXOR X10, X1
	PXOR X11, X2
	PXOR X12, X3
	PXOR X13, X4
	PXOR X14, X5

	// ---- Blend 2: if zero == 0 the result becomes (x2, y2, p256one) ----
	PCMPEQL X9, X9
	MOVOU X7, X15
	PANDN X9, X15

	MOVOU 96(SP), X9
	MOVOU 112(SP), X10
	MOVOU 128(SP), X11
	MOVOU 144(SP), X12
	MOVOU p256one<>+0(SB), X13
	MOVOU p256one<>+16(SB), X14

	PAND X15, X0
	PAND X15, X1
	PAND X15, X2
	PAND X15, X3
	PAND X15, X4
	PAND X15, X5

	PAND X7, X9
	PAND X7, X10
	PAND X7, X11
	PAND X7, X12
	PAND X7, X13
	PAND X7, X14

	PXOR X9, X0
	PXOR X10, X1
	PXOR X11, X2
	PXOR X12, X3
	PXOR X13, X4
	PXOR X14, X5

	// Write the 96-byte result
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, 32(AX)
	MOVOU X3, 48(AX)
	MOVOU X4, 64(AX)
	MOVOU X5, 80(AX)

	// Clear the parked result pointer before returning
	MOVQ $0, 480(SP)
	RET
|
|
|
|
// p256one is a 32-byte read-only constant made of four 64-bit words, listed
// here from the lowest offset (+0) to the highest (+24).
// The routine preceding this table loads it as two 16-byte halves and
// conditionally selects it for bytes 64..95 of its output.
// NOTE(review): presumably the field element 1 in the Montgomery domain
// (2^256 mod P); the words of P are not visible in this part of the file, so
// confirm against p256const0/p256const1.
DATA p256one<>+0(SB)/8, $0x0000000000000001
|
|
DATA p256one<>+8(SB)/8, $0xffffffff00000000
|
|
DATA p256one<>+16(SB)/8, $0xffffffffffffffff
|
|
DATA p256one<>+24(SB)/8, $0x00000000fffffffe
|
|
// 32 bytes, placed in read-only data; the <> suffix keeps the symbol file-local.
GLOBL p256one<>(SB), RODATA, $32
|
|
|
|
// func p256IsZero()
|
|
// Requires: CMOV
|
|
//
// Internal helper with a register-based convention: it has a zero-size frame
// and does not read any stack arguments.
//   In:  R10, R11, R12, R13 - four 64-bit words, called [acc4..acc7] below.
//   Out: AX = 1 if all four words are zero, or if they equal the words
//        (-1, p256const0, 0, p256const1) that the comments below call P;
//        AX = 0 otherwise.
//   Clobbers: R10, R11, R13, R14, R15 and the flags. R12 is only read.
// The routine contains no branches; the result is selected with CMOV.
TEXT p256IsZero<>(SB), NOSPLIT, $0
|
|
// AX contains a flag that is set if the input is zero.
|
|
XORQ AX, AX
|
|
// R15 holds the constant 1 that the CMOVs below copy into AX.
MOVQ $0x00000001, R15
|
|
|
|
// Check whether [acc4..acc7] are all zero.
|
|
// The OR is accumulated in R14 so that R10..R13 stay intact for the second test.
MOVQ R10, R14
|
|
ORQ R11, R14
|
|
ORQ R12, R14
|
|
// ZF from this last OR is what the following CMOVQEQ tests.
ORQ R13, R14
|
|
|
|
// Set the zero flag if so. (CMOV of a constant to a register doesn't
|
|
// appear to be supported in Go. Thus t1 = 1.)
|
|
// "Zero flag" here means the result in AX; t1 is R15.
CMOVQEQ R15, AX
|
|
|
|
// XOR [acc4..acc7] with P and compare with zero again.
|
|
// R12 is not XORed: it is ORed in unchanged below, i.e. compared against 0.
XORQ $-1, R10
|
|
XORQ p256const0<>+0(SB), R11
|
|
XORQ p256const1<>+0(SB), R13
|
|
ORQ R11, R10
|
|
ORQ R12, R10
|
|
ORQ R13, R10
|
|
|
|
// Set the zero flag if so.
|
|
// AX keeps a 1 from the first test; otherwise it becomes 1 only if the XORed words are all zero.
CMOVQEQ R15, AX
|
|
RET
|
|
|
|
// func p256PointAddAsm(res *P256Point, in1 *P256Point, in2 *P256Point) int
|
|
// Requires: CMOV, SSE2
|
|
//
// Computes a 96-byte result from the two 96-byte inputs and writes it to
// *res. The returned int is the AND of two p256IsZero checks made on
// intermediate differences (see the notes at the two CALL sites).
// NOTE(review): a return value of 1 presumably tells the caller that both
// inputs represent the same point and that the result must not be used -
// confirm against the Go caller.
//
// Unlike the neighbouring routines this one is not marked NOSPLIT.
//
// Frame layout (offsets from SP; 32 bytes per value unless noted):
//     0,  32,  64   copy of *in1, called x1, y1, z1 below
//    96, 128, 160   copy of *in2, called x2, y2, z2 below
//   192, 224, 256   result, copied to *res at the end
//   288 .. 608      scratch values, described at each store
//   640             saved res pointer (8 bytes)
//   648             accumulated return flag (8 bytes)
//
// NOTE(review): the x/y/z naming assumes the P256Point layout indicated by
// the "Store x/y/z" comments in p256PointDoubleAsm (offsets 0/32/64).
// NOTE(review): the formulas in [brackets] assume that p256SqrInternal
// squares R10..R13, that p256MulInternal multiplies R10..R13 by R14,R15,DI,SI
// and that p256SubInternal subtracts R14,R15,DI,SI from R10..R13, each leaving
// its result in R10..R13. Those helpers are defined outside this part of the
// file - confirm.
TEXT ·p256PointAddAsm(SB), $680-32
|
|
// Move input to stack in order to free registers
|
|
MOVQ res+0(FP), AX
|
|
MOVQ in1+8(FP), BX
|
|
MOVQ in2+16(FP), CX
|
|
// Copy the 96 bytes of *in1 to 0..95(SP).
MOVOU (BX), X0
|
|
MOVOU 16(BX), X1
|
|
MOVOU 32(BX), X2
|
|
MOVOU 48(BX), X3
|
|
MOVOU 64(BX), X4
|
|
MOVOU 80(BX), X5
|
|
MOVOU X0, (SP)
|
|
MOVOU X1, 16(SP)
|
|
MOVOU X2, 32(SP)
|
|
MOVOU X3, 48(SP)
|
|
MOVOU X4, 64(SP)
|
|
MOVOU X5, 80(SP)
|
|
// Copy the 96 bytes of *in2 to 96..191(SP).
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU 32(CX), X2
|
|
MOVOU 48(CX), X3
|
|
MOVOU 64(CX), X4
|
|
MOVOU 80(CX), X5
|
|
MOVOU X0, 96(SP)
|
|
MOVOU X1, 112(SP)
|
|
MOVOU X2, 128(SP)
|
|
MOVOU X3, 144(SP)
|
|
MOVOU X4, 160(SP)
|
|
MOVOU X5, 176(SP)
|
|
|
|
// Store pointer to result
|
|
MOVQ AX, 640(SP)
|
|
|
|
// Begin point add
|
|
// 448(SP) <- Sqr(z2)   [z2^2]
MOVQ 160(SP), R10
|
|
MOVQ 168(SP), R11
|
|
MOVQ 176(SP), R12
|
|
MOVQ 184(SP), R13
|
|
CALL p256SqrInternal<>(SB)
|
|
MOVQ R10, 448(SP)
|
|
MOVQ R11, 456(SP)
|
|
MOVQ R12, 464(SP)
|
|
MOVQ R13, 472(SP)
|
|
// Multiply the running value by z2   [z2^3]
MOVQ 160(SP), R14
|
|
MOVQ 168(SP), R15
|
|
MOVQ 176(SP), DI
|
|
MOVQ 184(SP), SI
|
|
CALL p256MulInternal<>(SB)
|
|
// 352(SP) <- running value times y1   [s1 = y1 * z2^3]
MOVQ 32(SP), R14
|
|
MOVQ 40(SP), R15
|
|
MOVQ 48(SP), DI
|
|
MOVQ 56(SP), SI
|
|
CALL p256MulInternal<>(SB)
|
|
MOVQ R10, 352(SP)
|
|
MOVQ R11, 360(SP)
|
|
MOVQ R12, 368(SP)
|
|
MOVQ R13, 376(SP)
|
|
// 416(SP) <- Sqr(z1)   [z1^2]
MOVQ 64(SP), R10
|
|
MOVQ 72(SP), R11
|
|
MOVQ 80(SP), R12
|
|
MOVQ 88(SP), R13
|
|
CALL p256SqrInternal<>(SB)
|
|
MOVQ R10, 416(SP)
|
|
MOVQ R11, 424(SP)
|
|
MOVQ R12, 432(SP)
|
|
MOVQ R13, 440(SP)
|
|
// Multiply the running value by z1   [z1^3]
MOVQ 64(SP), R14
|
|
MOVQ 72(SP), R15
|
|
MOVQ 80(SP), DI
|
|
MOVQ 88(SP), SI
|
|
CALL p256MulInternal<>(SB)
|
|
// 384(SP) <- running value times y2   [s2 = y2 * z1^3]
MOVQ 128(SP), R14
|
|
MOVQ 136(SP), R15
|
|
MOVQ 144(SP), DI
|
|
MOVQ 152(SP), SI
|
|
CALL p256MulInternal<>(SB)
|
|
MOVQ R10, 384(SP)
|
|
MOVQ R11, 392(SP)
|
|
MOVQ R12, 400(SP)
|
|
MOVQ R13, 408(SP)
|
|
// 512(SP) <- Sub(running value, 352(SP))   [r = s2 - s1]
MOVQ 352(SP), R14
|
|
MOVQ 360(SP), R15
|
|
MOVQ 368(SP), DI
|
|
MOVQ 376(SP), SI
|
|
CALL p256SubInternal<>(SB)
|
|
MOVQ R10, 512(SP)
|
|
MOVQ R11, 520(SP)
|
|
MOVQ R12, 528(SP)
|
|
MOVQ R13, 536(SP)
|
|
// First half of the return flag: AX = 1 if the value just stored is 0 or P.
// p256IsZero overwrites R10, R11 and R13, so they are reloaded afterwards.
CALL p256IsZero<>(SB)
|
|
MOVQ AX, 648(SP)
|
|
// 288(SP) <- Mul(448(SP), x1)   [u1 = x1 * z2^2]
MOVQ 448(SP), R10
|
|
MOVQ 456(SP), R11
|
|
MOVQ 464(SP), R12
|
|
MOVQ 472(SP), R13
|
|
MOVQ (SP), R14
|
|
MOVQ 8(SP), R15
|
|
MOVQ 16(SP), DI
|
|
MOVQ 24(SP), SI
|
|
CALL p256MulInternal<>(SB)
|
|
MOVQ R10, 288(SP)
|
|
MOVQ R11, 296(SP)
|
|
MOVQ R12, 304(SP)
|
|
MOVQ R13, 312(SP)
|
|
// 320(SP) <- Mul(416(SP), x2)   [u2 = x2 * z1^2]
MOVQ 416(SP), R10
|
|
MOVQ 424(SP), R11
|
|
MOVQ 432(SP), R12
|
|
MOVQ 440(SP), R13
|
|
MOVQ 96(SP), R14
|
|
MOVQ 104(SP), R15
|
|
MOVQ 112(SP), DI
|
|
MOVQ 120(SP), SI
|
|
CALL p256MulInternal<>(SB)
|
|
MOVQ R10, 320(SP)
|
|
MOVQ R11, 328(SP)
|
|
MOVQ R12, 336(SP)
|
|
MOVQ R13, 344(SP)
|
|
// 480(SP) <- Sub(running value, 288(SP))   [h = u2 - u1]
MOVQ 288(SP), R14
|
|
MOVQ 296(SP), R15
|
|
MOVQ 304(SP), DI
|
|
MOVQ 312(SP), SI
|
|
CALL p256SubInternal<>(SB)
|
|
MOVQ R10, 480(SP)
|
|
MOVQ R11, 488(SP)
|
|
MOVQ R12, 496(SP)
|
|
MOVQ R13, 504(SP)
|
|
// Second half of the return flag, ANDed with the first: 648(SP) ends up 1
// only if both checked values were 0 or P.
CALL p256IsZero<>(SB)
|
|
ANDQ 648(SP), AX
|
|
MOVQ AX, 648(SP)
|
|
// 576(SP) <- Sqr(512(SP))   [r^2]
MOVQ 512(SP), R10
|
|
MOVQ 520(SP), R11
|
|
MOVQ 528(SP), R12
|
|
MOVQ 536(SP), R13
|
|
CALL p256SqrInternal<>(SB)
|
|
MOVQ R10, 576(SP)
|
|
MOVQ R11, 584(SP)
|
|
MOVQ R12, 592(SP)
|
|
MOVQ R13, 600(SP)
|
|
// 544(SP) <- Sqr(480(SP))   [h^2]
MOVQ 480(SP), R10
|
|
MOVQ 488(SP), R11
|
|
MOVQ 496(SP), R12
|
|
MOVQ 504(SP), R13
|
|
CALL p256SqrInternal<>(SB)
|
|
MOVQ R10, 544(SP)
|
|
MOVQ R11, 552(SP)
|
|
MOVQ R12, 560(SP)
|
|
MOVQ R13, 568(SP)
|
|
// 608(SP) <- running value times 480(SP)   [h^3]
MOVQ 480(SP), R14
|
|
MOVQ 488(SP), R15
|
|
MOVQ 496(SP), DI
|
|
MOVQ 504(SP), SI
|
|
CALL p256MulInternal<>(SB)
|
|
MOVQ R10, 608(SP)
|
|
MOVQ R11, 616(SP)
|
|
MOVQ R12, 624(SP)
|
|
MOVQ R13, 632(SP)
|
|
// 384(SP) <- running value times 352(SP)   [s1 * h^3]; this reuses the slot
// that previously held s2.
MOVQ 352(SP), R14
|
|
MOVQ 360(SP), R15
|
|
MOVQ 368(SP), DI
|
|
MOVQ 376(SP), SI
|
|
CALL p256MulInternal<>(SB)
|
|
MOVQ R10, 384(SP)
|
|
MOVQ R11, 392(SP)
|
|
MOVQ R12, 400(SP)
|
|
MOVQ R13, 408(SP)
|
|
// Mul(z1, z2)
MOVQ 64(SP), R10
|
|
MOVQ 72(SP), R11
|
|
MOVQ 80(SP), R12
|
|
MOVQ 88(SP), R13
|
|
MOVQ 160(SP), R14
|
|
MOVQ 168(SP), R15
|
|
MOVQ 176(SP), DI
|
|
MOVQ 184(SP), SI
|
|
CALL p256MulInternal<>(SB)
|
|
// 256(SP) <- running value times 480(SP)   [z3 = z1 * z2 * h];
// this is bytes 64..95 of the result.
MOVQ 480(SP), R14
|
|
MOVQ 488(SP), R15
|
|
MOVQ 496(SP), DI
|
|
MOVQ 504(SP), SI
|
|
CALL p256MulInternal<>(SB)
|
|
MOVQ R10, 256(SP)
|
|
MOVQ R11, 264(SP)
|
|
MOVQ R12, 272(SP)
|
|
MOVQ R13, 280(SP)
|
|
// 320(SP) <- Mul(544(SP), 288(SP))   [u1 * h^2]; this reuses the slot that
// previously held u2.
MOVQ 544(SP), R10
|
|
MOVQ 552(SP), R11
|
|
MOVQ 560(SP), R12
|
|
MOVQ 568(SP), R13
|
|
MOVQ 288(SP), R14
|
|
MOVQ 296(SP), R15
|
|
MOVQ 304(SP), DI
|
|
MOVQ 312(SP), SI
|
|
CALL p256MulInternal<>(SB)
|
|
MOVQ R10, 320(SP)
|
|
MOVQ R11, 328(SP)
|
|
MOVQ R12, 336(SP)
|
|
MOVQ R13, 344(SP)
|
|
// Inline doubling: R10..R13 is added to itself with the carry-out collected
// in AX; a copy in R14,R15,DI,SI then has the words (-1, p256const0, 0,
// p256const1) subtracted. If that subtraction borrows (CF set), the
// unreduced sum is restored. The doubled value ends up in R14,R15,DI,SI.
XORQ AX, AX
|
|
ADDQ R10, R10
|
|
ADCQ R11, R11
|
|
ADCQ R12, R12
|
|
ADCQ R13, R13
|
|
ADCQ $+0, AX
|
|
MOVQ R10, R14
|
|
MOVQ R11, R15
|
|
MOVQ R12, DI
|
|
MOVQ R13, SI
|
|
SUBQ $-1, R14
|
|
SBBQ p256const0<>+0(SB), R15
|
|
SBBQ $+0, DI
|
|
SBBQ p256const1<>+0(SB), SI
|
|
SBBQ $+0, AX
|
|
CMOVQCS R10, R14
|
|
CMOVQCS R11, R15
|
|
CMOVQCS R12, DI
|
|
CMOVQCS R13, SI
|
|
// Sub(576(SP), doubled value)   [r^2 - 2 * u1 * h^2]
MOVQ 576(SP), R10
|
|
MOVQ 584(SP), R11
|
|
MOVQ 592(SP), R12
|
|
MOVQ 600(SP), R13
|
|
CALL p256SubInternal<>(SB)
|
|
// 192(SP) <- Sub(running value, 608(SP))   [x3 = r^2 - 2 * u1 * h^2 - h^3];
// this is bytes 0..31 of the result.
MOVQ 608(SP), R14
|
|
MOVQ 616(SP), R15
|
|
MOVQ 624(SP), DI
|
|
MOVQ 632(SP), SI
|
|
CALL p256SubInternal<>(SB)
|
|
MOVQ R10, 192(SP)
|
|
MOVQ R11, 200(SP)
|
|
MOVQ R12, 208(SP)
|
|
MOVQ R13, 216(SP)
|
|
// Move the value just stored into the second-operand registers, then
// Sub(320(SP), that value)   [u1 * h^2 - x3]
MOVQ R10, R14
|
|
MOVQ R11, R15
|
|
MOVQ R12, DI
|
|
MOVQ R13, SI
|
|
MOVQ 320(SP), R10
|
|
MOVQ 328(SP), R11
|
|
MOVQ 336(SP), R12
|
|
MOVQ 344(SP), R13
|
|
CALL p256SubInternal<>(SB)
|
|
// Multiply the running value by 512(SP)   [r * (u1 * h^2 - x3)]
MOVQ 512(SP), R14
|
|
MOVQ 520(SP), R15
|
|
MOVQ 528(SP), DI
|
|
MOVQ 536(SP), SI
|
|
CALL p256MulInternal<>(SB)
|
|
// 224(SP) <- Sub(running value, 384(SP))   [y3 = r * (u1 * h^2 - x3) - s1 * h^3];
// this is bytes 32..63 of the result.
MOVQ 384(SP), R14
|
|
MOVQ 392(SP), R15
|
|
MOVQ 400(SP), DI
|
|
MOVQ 408(SP), SI
|
|
CALL p256SubInternal<>(SB)
|
|
MOVQ R10, 224(SP)
|
|
MOVQ R11, 232(SP)
|
|
MOVQ R12, 240(SP)
|
|
MOVQ R13, 248(SP)
|
|
// Load the 96-byte result at 192..287(SP) into X0..X5.
MOVOU 192(SP), X0
|
|
MOVOU 208(SP), X1
|
|
MOVOU 224(SP), X2
|
|
MOVOU 240(SP), X3
|
|
MOVOU 256(SP), X4
|
|
MOVOU 272(SP), X5
|
|
|
|
// Finally output the result
|
|
MOVQ 640(SP), AX
|
|
// Clear the saved pointer slot.
// NOTE(review): presumably so that no stale pointer is left in the frame - confirm.
MOVQ $0x00000000, 640(SP)
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, 32(AX)
|
|
MOVOU X3, 48(AX)
|
|
MOVOU X4, 64(AX)
|
|
MOVOU X5, 80(AX)
|
|
// Return the flag accumulated at 648(SP).
MOVQ 648(SP), AX
|
|
MOVQ AX, ret+24(FP)
|
|
RET
|
|
|
|
// func p256PointDoubleAsm(res *P256Point, in *P256Point)
|
|
// Requires: CMOV, SSE2
|
|
//
// Computes a 96-byte result from the 96-byte input and writes it to *res in
// three parts: bytes 64..95 ("Store z"), bytes 0..31 ("Store x") and bytes
// 32..63 ("Store y"). The input is copied to the frame first and every later
// read comes from that copy, not from *in.
//
// Frame layout (offsets from SP; 32 bytes per value unless noted):
//     0,  32,  64   copy of *in, called x, y, z below; the slot at 32 is
//                   later overwritten with a scratch value
//    96, 128        scratch values, described at each store
//   160             [z^2]
//   192             scratch value
//   224             saved res pointer (8 bytes)
//
// NOTE(review): the x/y/z naming assumes the input uses the same layout as
// the output labelled by the "Store x/y/z" comments.
// NOTE(review): the formulas in [brackets] assume that p256SqrInternal
// squares R10..R13, that p256MulInternal multiplies R10..R13 by R14,R15,DI,SI
// and that p256SubInternal subtracts R14,R15,DI,SI from R10..R13, each leaving
// its result in R10..R13. Those helpers are defined outside this part of the
// file - confirm.
//
// Two inline sequences recur below:
//   - add/double: R10..R13 receives a sum with the carry-out collected in AX;
//     a copy in R14,R15,DI,SI then has the words (-1, p256const0, 0,
//     p256const1) subtracted, and the unreduced sum is restored if that
//     subtraction borrows (CF set). The result is left in R14,R15,DI,SI.
//   - halve: see the "Divide by 2" section.
TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $256-16
|
|
MOVQ res+0(FP), AX
|
|
MOVQ in+8(FP), BX
|
|
// Copy the 96 bytes of *in to 0..95(SP).
MOVOU (BX), X0
|
|
MOVOU 16(BX), X1
|
|
MOVOU 32(BX), X2
|
|
MOVOU 48(BX), X3
|
|
MOVOU 64(BX), X4
|
|
MOVOU 80(BX), X5
|
|
MOVOU X0, (SP)
|
|
MOVOU X1, 16(SP)
|
|
MOVOU X2, 32(SP)
|
|
MOVOU X3, 48(SP)
|
|
MOVOU X4, 64(SP)
|
|
MOVOU X5, 80(SP)
|
|
|
|
// Store pointer to result
|
|
MOVQ AX, 224(SP)
|
|
|
|
// Begin point double
|
|
// 160(SP) <- Sqr(z)   [z^2]
MOVQ 64(SP), R10
|
|
MOVQ 72(SP), R11
|
|
MOVQ 80(SP), R12
|
|
MOVQ 88(SP), R13
|
|
CALL p256SqrInternal<>(SB)
|
|
MOVQ R10, 160(SP)
|
|
MOVQ R11, 168(SP)
|
|
MOVQ R12, 176(SP)
|
|
MOVQ R13, 184(SP)
|
|
// 128(SP) <- inline add of x to the running value   [x + z^2]
MOVQ (SP), R14
|
|
MOVQ 8(SP), R15
|
|
MOVQ 16(SP), DI
|
|
MOVQ 24(SP), SI
|
|
XORQ AX, AX
|
|
ADDQ R14, R10
|
|
ADCQ R15, R11
|
|
ADCQ DI, R12
|
|
ADCQ SI, R13
|
|
ADCQ $+0, AX
|
|
MOVQ R10, R14
|
|
MOVQ R11, R15
|
|
MOVQ R12, DI
|
|
MOVQ R13, SI
|
|
SUBQ $-1, R14
|
|
SBBQ p256const0<>+0(SB), R15
|
|
SBBQ $+0, DI
|
|
SBBQ p256const1<>+0(SB), SI
|
|
SBBQ $+0, AX
|
|
CMOVQCS R10, R14
|
|
CMOVQCS R11, R15
|
|
CMOVQCS R12, DI
|
|
CMOVQCS R13, SI
|
|
MOVQ R14, 128(SP)
|
|
MOVQ R15, 136(SP)
|
|
MOVQ DI, 144(SP)
|
|
MOVQ SI, 152(SP)
|
|
// Mul(z, y), then inline doubling   [2 * y * z]
MOVQ 64(SP), R10
|
|
MOVQ 72(SP), R11
|
|
MOVQ 80(SP), R12
|
|
MOVQ 88(SP), R13
|
|
MOVQ 32(SP), R14
|
|
MOVQ 40(SP), R15
|
|
MOVQ 48(SP), DI
|
|
MOVQ 56(SP), SI
|
|
CALL p256MulInternal<>(SB)
|
|
XORQ AX, AX
|
|
ADDQ R10, R10
|
|
ADCQ R11, R11
|
|
ADCQ R12, R12
|
|
ADCQ R13, R13
|
|
ADCQ $+0, AX
|
|
MOVQ R10, R14
|
|
MOVQ R11, R15
|
|
MOVQ R12, DI
|
|
MOVQ R13, SI
|
|
SUBQ $-1, R14
|
|
SBBQ p256const0<>+0(SB), R15
|
|
SBBQ $+0, DI
|
|
SBBQ p256const1<>+0(SB), SI
|
|
SBBQ $+0, AX
|
|
CMOVQCS R10, R14
|
|
CMOVQCS R11, R15
|
|
CMOVQCS R12, DI
|
|
CMOVQCS R13, SI
|
|
// Reload the result pointer; AX was used as the carry word above.
MOVQ 224(SP), AX
|
|
|
|
// Store z
|
|
MOVQ R14, 64(AX)
|
|
MOVQ R15, 72(AX)
|
|
MOVQ DI, 80(AX)
|
|
MOVQ SI, 88(AX)
|
|
// Sub(x, 160(SP))   [x - z^2]
MOVQ (SP), R10
|
|
MOVQ 8(SP), R11
|
|
MOVQ 16(SP), R12
|
|
MOVQ 24(SP), R13
|
|
MOVQ 160(SP), R14
|
|
MOVQ 168(SP), R15
|
|
MOVQ 176(SP), DI
|
|
MOVQ 184(SP), SI
|
|
CALL p256SubInternal<>(SB)
|
|
// 128(SP) <- running value times 128(SP)   [m = (x - z^2) * (x + z^2)]
MOVQ 128(SP), R14
|
|
MOVQ 136(SP), R15
|
|
MOVQ 144(SP), DI
|
|
MOVQ 152(SP), SI
|
|
CALL p256MulInternal<>(SB)
|
|
MOVQ R10, 128(SP)
|
|
MOVQ R11, 136(SP)
|
|
MOVQ R12, 144(SP)
|
|
MOVQ R13, 152(SP)
|
|
|
|
// Multiply by 3
|
|
// Done as an inline doubling followed by an inline add of the original
// value reloaded from 128(SP); the tripled value replaces 128(SP).
XORQ AX, AX
|
|
ADDQ R10, R10
|
|
ADCQ R11, R11
|
|
ADCQ R12, R12
|
|
ADCQ R13, R13
|
|
ADCQ $+0, AX
|
|
MOVQ R10, R14
|
|
MOVQ R11, R15
|
|
MOVQ R12, DI
|
|
MOVQ R13, SI
|
|
SUBQ $-1, R14
|
|
SBBQ p256const0<>+0(SB), R15
|
|
SBBQ $+0, DI
|
|
SBBQ p256const1<>+0(SB), SI
|
|
SBBQ $+0, AX
|
|
CMOVQCS R10, R14
|
|
CMOVQCS R11, R15
|
|
CMOVQCS R12, DI
|
|
CMOVQCS R13, SI
|
|
MOVQ 128(SP), R10
|
|
MOVQ 136(SP), R11
|
|
MOVQ 144(SP), R12
|
|
MOVQ 152(SP), R13
|
|
XORQ AX, AX
|
|
ADDQ R14, R10
|
|
ADCQ R15, R11
|
|
ADCQ DI, R12
|
|
ADCQ SI, R13
|
|
ADCQ $+0, AX
|
|
MOVQ R10, R14
|
|
MOVQ R11, R15
|
|
MOVQ R12, DI
|
|
MOVQ R13, SI
|
|
SUBQ $-1, R14
|
|
SBBQ p256const0<>+0(SB), R15
|
|
SBBQ $+0, DI
|
|
SBBQ p256const1<>+0(SB), SI
|
|
SBBQ $+0, AX
|
|
CMOVQCS R10, R14
|
|
CMOVQCS R11, R15
|
|
CMOVQCS R12, DI
|
|
CMOVQCS R13, SI
|
|
MOVQ R14, 128(SP)
|
|
MOVQ R15, 136(SP)
|
|
MOVQ DI, 144(SP)
|
|
MOVQ SI, 152(SP)
|
|
|
|
// ////////////////////////
|
|
// Inline doubling of y   [2 * y]
MOVQ 32(SP), R10
|
|
MOVQ 40(SP), R11
|
|
MOVQ 48(SP), R12
|
|
MOVQ 56(SP), R13
|
|
XORQ AX, AX
|
|
ADDQ R10, R10
|
|
ADCQ R11, R11
|
|
ADCQ R12, R12
|
|
ADCQ R13, R13
|
|
ADCQ $+0, AX
|
|
MOVQ R10, R14
|
|
MOVQ R11, R15
|
|
MOVQ R12, DI
|
|
MOVQ R13, SI
|
|
SUBQ $-1, R14
|
|
SBBQ p256const0<>+0(SB), R15
|
|
SBBQ $+0, DI
|
|
SBBQ p256const1<>+0(SB), SI
|
|
SBBQ $+0, AX
|
|
CMOVQCS R10, R14
|
|
CMOVQCS R11, R15
|
|
CMOVQCS R12, DI
|
|
CMOVQCS R13, SI
|
|
// Move the doubled value back into R10..R13; 96(SP) <- its square   [4 * y^2]
MOVQ R14, R10
|
|
MOVQ R15, R11
|
|
MOVQ DI, R12
|
|
MOVQ SI, R13
|
|
CALL p256SqrInternal<>(SB)
|
|
MOVQ R10, 96(SP)
|
|
MOVQ R11, 104(SP)
|
|
MOVQ R12, 112(SP)
|
|
MOVQ R13, 120(SP)
|
|
// Square the running value again   [16 * y^4]
CALL p256SqrInternal<>(SB)
|
|
|
|
// Divide by 2
|
|
// The original value is kept in R14,R15,DI,SI while the words (-1,
// p256const0, 0, p256const1) are added to R10..R13 with the carry-out in AX.
XORQ AX, AX
|
|
MOVQ R10, R14
|
|
MOVQ R11, R15
|
|
MOVQ R12, DI
|
|
MOVQ R13, SI
|
|
ADDQ $-1, R10
|
|
ADCQ p256const0<>+0(SB), R11
|
|
ADCQ $0x00, R12
|
|
ADCQ p256const1<>+0(SB), R13
|
|
ADCQ $0x00, AX
|
|
// If the original value is even (bit 0 of R14 clear), restore it.
TESTQ $0x00000001, R14
|
|
CMOVQEQ R14, R10
|
|
CMOVQEQ R15, R11
|
|
CMOVQEQ DI, R12
|
|
CMOVQEQ SI, R13
|
|
// AX is 0 or 1 here, so this keeps the carry only when bit 0 of R14 is set.
ANDQ R14, AX
|
|
// Shift the value AX:R13:R12:R11:R10 right by one bit, each word taking
// its new top bit from the next higher word.
SHRQ $0x01, R11, R10
|
|
SHRQ $0x01, R12, R11
|
|
SHRQ $0x01, R13, R12
|
|
SHRQ $0x01, AX, R13
|
|
// 32(SP) <- halved value   [8 * y^4]; this overwrites the copy of y, which
// is not read again as y below.
MOVQ R10, 32(SP)
|
|
MOVQ R11, 40(SP)
|
|
MOVQ R12, 48(SP)
|
|
MOVQ R13, 56(SP)
|
|
|
|
// /////////////////////////
|
|
// 96(SP) <- Mul(x, 96(SP))   [s = 4 * x * y^2]
MOVQ (SP), R10
|
|
MOVQ 8(SP), R11
|
|
MOVQ 16(SP), R12
|
|
MOVQ 24(SP), R13
|
|
MOVQ 96(SP), R14
|
|
MOVQ 104(SP), R15
|
|
MOVQ 112(SP), DI
|
|
MOVQ 120(SP), SI
|
|
CALL p256MulInternal<>(SB)
|
|
MOVQ R10, 96(SP)
|
|
MOVQ R11, 104(SP)
|
|
MOVQ R12, 112(SP)
|
|
MOVQ R13, 120(SP)
|
|
// 192(SP) <- inline doubling of the running value   [2 * s]
XORQ AX, AX
|
|
ADDQ R10, R10
|
|
ADCQ R11, R11
|
|
ADCQ R12, R12
|
|
ADCQ R13, R13
|
|
ADCQ $+0, AX
|
|
MOVQ R10, R14
|
|
MOVQ R11, R15
|
|
MOVQ R12, DI
|
|
MOVQ R13, SI
|
|
SUBQ $-1, R14
|
|
SBBQ p256const0<>+0(SB), R15
|
|
SBBQ $+0, DI
|
|
SBBQ p256const1<>+0(SB), SI
|
|
SBBQ $+0, AX
|
|
CMOVQCS R10, R14
|
|
CMOVQCS R11, R15
|
|
CMOVQCS R12, DI
|
|
CMOVQCS R13, SI
|
|
MOVQ R14, 192(SP)
|
|
MOVQ R15, 200(SP)
|
|
MOVQ DI, 208(SP)
|
|
MOVQ SI, 216(SP)
|
|
// Sqr(128(SP)), then Sub(running value, 192(SP))   [(3 * m)^2 - 2 * s]
MOVQ 128(SP), R10
|
|
MOVQ 136(SP), R11
|
|
MOVQ 144(SP), R12
|
|
MOVQ 152(SP), R13
|
|
CALL p256SqrInternal<>(SB)
|
|
MOVQ 192(SP), R14
|
|
MOVQ 200(SP), R15
|
|
MOVQ 208(SP), DI
|
|
MOVQ 216(SP), SI
|
|
CALL p256SubInternal<>(SB)
|
|
MOVQ 224(SP), AX
|
|
|
|
// Store x
|
|
MOVQ R10, (AX)
|
|
MOVQ R11, 8(AX)
|
|
MOVQ R12, 16(AX)
|
|
MOVQ R13, 24(AX)
|
|
// Move the value just stored into the second-operand registers, then
// Sub(96(SP), that value)   [s - x_out]
MOVQ R10, R14
|
|
MOVQ R11, R15
|
|
MOVQ R12, DI
|
|
MOVQ R13, SI
|
|
MOVQ 96(SP), R10
|
|
MOVQ 104(SP), R11
|
|
MOVQ 112(SP), R12
|
|
MOVQ 120(SP), R13
|
|
CALL p256SubInternal<>(SB)
|
|
// Multiply the running value by 128(SP)   [3 * m * (s - x_out)]
MOVQ 128(SP), R14
|
|
MOVQ 136(SP), R15
|
|
MOVQ 144(SP), DI
|
|
MOVQ 152(SP), SI
|
|
CALL p256MulInternal<>(SB)
|
|
// Sub(running value, 32(SP))   [3 * m * (s - x_out) - 8 * y^4]
MOVQ 32(SP), R14
|
|
MOVQ 40(SP), R15
|
|
MOVQ 48(SP), DI
|
|
MOVQ 56(SP), SI
|
|
CALL p256SubInternal<>(SB)
|
|
MOVQ 224(SP), AX
|
|
|
|
// Store y
|
|
MOVQ R10, 32(AX)
|
|
MOVQ R11, 40(AX)
|
|
MOVQ R12, 48(AX)
|
|
MOVQ R13, 56(AX)
|
|
|
|
// ///////////////////////
|
|
// Clear the saved pointer slot.
// NOTE(review): presumably so that no stale pointer is left in the frame - confirm.
MOVQ $0x00000000, 224(SP)
|
|
RET
|