//go:build !appengine && gc && !purego // +build !appengine // +build gc // +build !purego #include "textflag.h" // Registers: #define h AX #define d AX #define p SI // pointer to advance through b #define n DX #define end BX // loop end #define v1 R8 #define v2 R9 #define v3 R10 #define v4 R11 #define x R12 #define prime1 R13 #define prime2 R14 #define prime4 DI #define round(acc, x) \ IMULQ prime2, x \ ADDQ x, acc \ ROLQ $31, acc \ IMULQ prime1, acc // round0 performs the operation x = round(0, x). #define round0(x) \ IMULQ prime2, x \ ROLQ $31, x \ IMULQ prime1, x // mergeRound applies a merge round on the two registers acc and x. // It assumes that prime1, prime2, and prime4 have been loaded. #define mergeRound(acc, x) \ round0(x) \ XORQ x, acc \ IMULQ prime1, acc \ ADDQ prime4, acc // blockLoop processes as many 32-byte blocks as possible, // updating v1, v2, v3, and v4. It assumes that there is at least one block // to process. #define blockLoop() \ loop: \ MOVQ +0(p), x \ round(v1, x) \ MOVQ +8(p), x \ round(v2, x) \ MOVQ +16(p), x \ round(v3, x) \ MOVQ +24(p), x \ round(v4, x) \ ADDQ $32, p \ CMPQ p, end \ JLE loop // func Sum64(b []byte) uint64 TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32 // Load fixed primes. MOVQ ·primes+0(SB), prime1 MOVQ ·primes+8(SB), prime2 MOVQ ·primes+24(SB), prime4 // Load slice. MOVQ b_base+0(FP), p MOVQ b_len+8(FP), n LEAQ (p)(n*1), end // The first loop limit will be len(b)-32. SUBQ $32, end // Check whether we have at least one block. CMPQ n, $32 JLT noBlocks // Set up initial state (v1, v2, v3, v4). MOVQ prime1, v1 ADDQ prime2, v1 MOVQ prime2, v2 XORQ v3, v3 XORQ v4, v4 SUBQ prime1, v4 blockLoop() MOVQ v1, h ROLQ $1, h MOVQ v2, x ROLQ $7, x ADDQ x, h MOVQ v3, x ROLQ $12, x ADDQ x, h MOVQ v4, x ROLQ $18, x ADDQ x, h mergeRound(h, v1) mergeRound(h, v2) mergeRound(h, v3) mergeRound(h, v4) JMP afterBlocks noBlocks: MOVQ ·primes+32(SB), h afterBlocks: ADDQ n, h ADDQ $24, end CMPQ p, end JG try4 loop8: MOVQ (p), x ADDQ $8, p round0(x) XORQ x, h ROLQ $27, h IMULQ prime1, h ADDQ prime4, h CMPQ p, end JLE loop8 try4: ADDQ $4, end CMPQ p, end JG try1 MOVL (p), x ADDQ $4, p IMULQ prime1, x XORQ x, h ROLQ $23, h IMULQ prime2, h ADDQ ·primes+16(SB), h try1: ADDQ $4, end CMPQ p, end JGE finalize loop1: MOVBQZX (p), x ADDQ $1, p IMULQ ·primes+32(SB), x XORQ x, h ROLQ $11, h IMULQ prime1, h CMPQ p, end JL loop1 finalize: MOVQ h, x SHRQ $33, x XORQ x, h IMULQ prime2, h MOVQ h, x SHRQ $29, x XORQ x, h IMULQ ·primes+16(SB), h MOVQ h, x SHRQ $32, x XORQ x, h MOVQ h, ret+24(FP) RET // func writeBlocks(d *Digest, b []byte) int TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40 // Load fixed primes needed for round. MOVQ ·primes+0(SB), prime1 MOVQ ·primes+8(SB), prime2 // Load slice. MOVQ b_base+8(FP), p MOVQ b_len+16(FP), n LEAQ (p)(n*1), end SUBQ $32, end // Load vN from d. MOVQ s+0(FP), d MOVQ 0(d), v1 MOVQ 8(d), v2 MOVQ 16(d), v3 MOVQ 24(d), v4 // We don't need to check the loop condition here; this function is // always called with at least one block of data to process. blockLoop() // Copy vN back to d. MOVQ v1, 0(d) MOVQ v2, 8(d) MOVQ v3, 16(d) MOVQ v4, 24(d) // The number of bytes written is p minus the old base pointer. SUBQ b_base+8(FP), p MOVQ p, ret+32(FP) RET