mirror of https://github.com/docker/cli.git
vendor: github.com/cespare/xxhash/v2 v2.2.0
full diff: https://github.com/cespare/xxhash/compare/v2.1.2...v2.2.0 Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
This commit is contained in:
parent
c1d0657029
commit
5b138189b9
|
@ -50,7 +50,7 @@ require (
|
|||
github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 // indirect
|
||||
github.com/Microsoft/go-winio v0.5.2 // indirect
|
||||
github.com/beorn7/perks v1.0.1 // indirect
|
||||
github.com/cespare/xxhash/v2 v2.1.2 // indirect
|
||||
github.com/cespare/xxhash/v2 v2.2.0 // indirect
|
||||
github.com/docker/go v1.5.1-1.0.20160303222718-d30aec9fd63c // indirect
|
||||
github.com/docker/go-events v0.0.0-20190806004212-e31b211e4f1c // indirect
|
||||
github.com/docker/go-metrics v0.0.1 // indirect
|
||||
|
|
|
@ -67,8 +67,9 @@ github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA
|
|||
github.com/certifi/gocertifi v0.0.0-20191021191039-0944d244cd40/go.mod h1:sGbDF6GwGcLpkNXPUTkMRoywsNa/ol15pxFe6ERfguA=
|
||||
github.com/certifi/gocertifi v0.0.0-20200922220541-2c3bb06c6054/go.mod h1:sGbDF6GwGcLpkNXPUTkMRoywsNa/ol15pxFe6ERfguA=
|
||||
github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
||||
github.com/cespare/xxhash/v2 v2.1.2 h1:YRXhKfTDauu4ajMg1TPgFO5jnlC2HCbmLXMcTG5cbYE=
|
||||
github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
||||
github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=
|
||||
github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
||||
github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI=
|
||||
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI=
|
||||
github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU=
|
||||
|
|
|
@ -3,8 +3,7 @@
|
|||
[![Go Reference](https://pkg.go.dev/badge/github.com/cespare/xxhash/v2.svg)](https://pkg.go.dev/github.com/cespare/xxhash/v2)
|
||||
[![Test](https://github.com/cespare/xxhash/actions/workflows/test.yml/badge.svg)](https://github.com/cespare/xxhash/actions/workflows/test.yml)
|
||||
|
||||
xxhash is a Go implementation of the 64-bit
|
||||
[xxHash](http://cyan4973.github.io/xxHash/) algorithm, XXH64. This is a
|
||||
xxhash is a Go implementation of the 64-bit [xxHash] algorithm, XXH64. This is a
|
||||
high-quality hashing algorithm that is much faster than anything in the Go
|
||||
standard library.
|
||||
|
||||
|
@ -25,8 +24,11 @@ func (*Digest) WriteString(string) (int, error)
|
|||
func (*Digest) Sum64() uint64
|
||||
```
|
||||
|
||||
This implementation provides a fast pure-Go implementation and an even faster
|
||||
assembly implementation for amd64.
|
||||
The package is written with optimized pure Go and also contains even faster
|
||||
assembly implementations for amd64 and arm64. If desired, the `purego` build tag
|
||||
opts into using the Go code even on those architectures.
|
||||
|
||||
[xxHash]: http://cyan4973.github.io/xxHash/
|
||||
|
||||
## Compatibility
|
||||
|
||||
|
@ -45,19 +47,20 @@ I recommend using the latest release of Go.
|
|||
Here are some quick benchmarks comparing the pure-Go and assembly
|
||||
implementations of Sum64.
|
||||
|
||||
| input size | purego | asm |
|
||||
| --- | --- | --- |
|
||||
| 5 B | 979.66 MB/s | 1291.17 MB/s |
|
||||
| 100 B | 7475.26 MB/s | 7973.40 MB/s |
|
||||
| 4 KB | 17573.46 MB/s | 17602.65 MB/s |
|
||||
| 10 MB | 17131.46 MB/s | 17142.16 MB/s |
|
||||
| input size | purego | asm |
|
||||
| ---------- | --------- | --------- |
|
||||
| 4 B | 1.3 GB/s | 1.2 GB/s |
|
||||
| 16 B | 2.9 GB/s | 3.5 GB/s |
|
||||
| 100 B | 6.9 GB/s | 8.1 GB/s |
|
||||
| 4 KB | 11.7 GB/s | 16.7 GB/s |
|
||||
| 10 MB | 12.0 GB/s | 17.3 GB/s |
|
||||
|
||||
These numbers were generated on Ubuntu 18.04 with an Intel i7-8700K CPU using
|
||||
the following commands under Go 1.11.2:
|
||||
These numbers were generated on Ubuntu 20.04 with an Intel Xeon Platinum 8252C
|
||||
CPU using the following commands under Go 1.19.2:
|
||||
|
||||
```
|
||||
$ go test -tags purego -benchtime 10s -bench '/xxhash,direct,bytes'
|
||||
$ go test -benchtime 10s -bench '/xxhash,direct,bytes'
|
||||
benchstat <(go test -tags purego -benchtime 500ms -count 15 -bench 'Sum64$')
|
||||
benchstat <(go test -benchtime 500ms -count 15 -bench 'Sum64$')
|
||||
```
|
||||
|
||||
## Projects using this package
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
#!/bin/bash
|
||||
set -eu -o pipefail
|
||||
|
||||
# Small convenience script for running the tests with various combinations of
|
||||
# arch/tags. This assumes we're running on amd64 and have qemu available.
|
||||
|
||||
go test ./...
|
||||
go test -tags purego ./...
|
||||
GOARCH=arm64 go test
|
||||
GOARCH=arm64 go test -tags purego
|
|
@ -16,19 +16,11 @@ const (
|
|||
prime5 uint64 = 2870177450012600261
|
||||
)
|
||||
|
||||
// NOTE(caleb): I'm using both consts and vars of the primes. Using consts where
|
||||
// possible in the Go code is worth a small (but measurable) performance boost
|
||||
// by avoiding some MOVQs. Vars are needed for the asm and also are useful for
|
||||
// convenience in the Go code in a few places where we need to intentionally
|
||||
// avoid constant arithmetic (e.g., v1 := prime1 + prime2 fails because the
|
||||
// result overflows a uint64).
|
||||
var (
|
||||
prime1v = prime1
|
||||
prime2v = prime2
|
||||
prime3v = prime3
|
||||
prime4v = prime4
|
||||
prime5v = prime5
|
||||
)
|
||||
// Store the primes in an array as well.
|
||||
//
|
||||
// The consts are used when possible in Go code to avoid MOVs but we need a
|
||||
// contiguous array of the assembly code.
|
||||
var primes = [...]uint64{prime1, prime2, prime3, prime4, prime5}
|
||||
|
||||
// Digest implements hash.Hash64.
|
||||
type Digest struct {
|
||||
|
@ -50,10 +42,10 @@ func New() *Digest {
|
|||
|
||||
// Reset clears the Digest's state so that it can be reused.
|
||||
func (d *Digest) Reset() {
|
||||
d.v1 = prime1v + prime2
|
||||
d.v1 = primes[0] + prime2
|
||||
d.v2 = prime2
|
||||
d.v3 = 0
|
||||
d.v4 = -prime1v
|
||||
d.v4 = -primes[0]
|
||||
d.total = 0
|
||||
d.n = 0
|
||||
}
|
||||
|
@ -69,21 +61,23 @@ func (d *Digest) Write(b []byte) (n int, err error) {
|
|||
n = len(b)
|
||||
d.total += uint64(n)
|
||||
|
||||
memleft := d.mem[d.n&(len(d.mem)-1):]
|
||||
|
||||
if d.n+n < 32 {
|
||||
// This new data doesn't even fill the current block.
|
||||
copy(d.mem[d.n:], b)
|
||||
copy(memleft, b)
|
||||
d.n += n
|
||||
return
|
||||
}
|
||||
|
||||
if d.n > 0 {
|
||||
// Finish off the partial block.
|
||||
copy(d.mem[d.n:], b)
|
||||
c := copy(memleft, b)
|
||||
d.v1 = round(d.v1, u64(d.mem[0:8]))
|
||||
d.v2 = round(d.v2, u64(d.mem[8:16]))
|
||||
d.v3 = round(d.v3, u64(d.mem[16:24]))
|
||||
d.v4 = round(d.v4, u64(d.mem[24:32]))
|
||||
b = b[32-d.n:]
|
||||
b = b[c:]
|
||||
d.n = 0
|
||||
}
|
||||
|
||||
|
@ -133,21 +127,20 @@ func (d *Digest) Sum64() uint64 {
|
|||
|
||||
h += d.total
|
||||
|
||||
i, end := 0, d.n
|
||||
for ; i+8 <= end; i += 8 {
|
||||
k1 := round(0, u64(d.mem[i:i+8]))
|
||||
b := d.mem[:d.n&(len(d.mem)-1)]
|
||||
for ; len(b) >= 8; b = b[8:] {
|
||||
k1 := round(0, u64(b[:8]))
|
||||
h ^= k1
|
||||
h = rol27(h)*prime1 + prime4
|
||||
}
|
||||
if i+4 <= end {
|
||||
h ^= uint64(u32(d.mem[i:i+4])) * prime1
|
||||
if len(b) >= 4 {
|
||||
h ^= uint64(u32(b[:4])) * prime1
|
||||
h = rol23(h)*prime2 + prime3
|
||||
i += 4
|
||||
b = b[4:]
|
||||
}
|
||||
for i < end {
|
||||
h ^= uint64(d.mem[i]) * prime5
|
||||
for ; len(b) > 0; b = b[1:] {
|
||||
h ^= uint64(b[0]) * prime5
|
||||
h = rol11(h) * prime1
|
||||
i++
|
||||
}
|
||||
|
||||
h ^= h >> 33
|
||||
|
|
|
@ -1,215 +1,209 @@
|
|||
//go:build !appengine && gc && !purego
|
||||
// +build !appengine
|
||||
// +build gc
|
||||
// +build !purego
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// Register allocation:
|
||||
// AX h
|
||||
// SI pointer to advance through b
|
||||
// DX n
|
||||
// BX loop end
|
||||
// R8 v1, k1
|
||||
// R9 v2
|
||||
// R10 v3
|
||||
// R11 v4
|
||||
// R12 tmp
|
||||
// R13 prime1v
|
||||
// R14 prime2v
|
||||
// DI prime4v
|
||||
// Registers:
|
||||
#define h AX
|
||||
#define d AX
|
||||
#define p SI // pointer to advance through b
|
||||
#define n DX
|
||||
#define end BX // loop end
|
||||
#define v1 R8
|
||||
#define v2 R9
|
||||
#define v3 R10
|
||||
#define v4 R11
|
||||
#define x R12
|
||||
#define prime1 R13
|
||||
#define prime2 R14
|
||||
#define prime4 DI
|
||||
|
||||
// round reads from and advances the buffer pointer in SI.
|
||||
// It assumes that R13 has prime1v and R14 has prime2v.
|
||||
#define round(r) \
|
||||
MOVQ (SI), R12 \
|
||||
ADDQ $8, SI \
|
||||
IMULQ R14, R12 \
|
||||
ADDQ R12, r \
|
||||
ROLQ $31, r \
|
||||
IMULQ R13, r
|
||||
#define round(acc, x) \
|
||||
IMULQ prime2, x \
|
||||
ADDQ x, acc \
|
||||
ROLQ $31, acc \
|
||||
IMULQ prime1, acc
|
||||
|
||||
// mergeRound applies a merge round on the two registers acc and val.
|
||||
// It assumes that R13 has prime1v, R14 has prime2v, and DI has prime4v.
|
||||
#define mergeRound(acc, val) \
|
||||
IMULQ R14, val \
|
||||
ROLQ $31, val \
|
||||
IMULQ R13, val \
|
||||
XORQ val, acc \
|
||||
IMULQ R13, acc \
|
||||
ADDQ DI, acc
|
||||
// round0 performs the operation x = round(0, x).
|
||||
#define round0(x) \
|
||||
IMULQ prime2, x \
|
||||
ROLQ $31, x \
|
||||
IMULQ prime1, x
|
||||
|
||||
// mergeRound applies a merge round on the two registers acc and x.
|
||||
// It assumes that prime1, prime2, and prime4 have been loaded.
|
||||
#define mergeRound(acc, x) \
|
||||
round0(x) \
|
||||
XORQ x, acc \
|
||||
IMULQ prime1, acc \
|
||||
ADDQ prime4, acc
|
||||
|
||||
// blockLoop processes as many 32-byte blocks as possible,
|
||||
// updating v1, v2, v3, and v4. It assumes that there is at least one block
|
||||
// to process.
|
||||
#define blockLoop() \
|
||||
loop: \
|
||||
MOVQ +0(p), x \
|
||||
round(v1, x) \
|
||||
MOVQ +8(p), x \
|
||||
round(v2, x) \
|
||||
MOVQ +16(p), x \
|
||||
round(v3, x) \
|
||||
MOVQ +24(p), x \
|
||||
round(v4, x) \
|
||||
ADDQ $32, p \
|
||||
CMPQ p, end \
|
||||
JLE loop
|
||||
|
||||
// func Sum64(b []byte) uint64
|
||||
TEXT ·Sum64(SB), NOSPLIT, $0-32
|
||||
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
|
||||
// Load fixed primes.
|
||||
MOVQ ·prime1v(SB), R13
|
||||
MOVQ ·prime2v(SB), R14
|
||||
MOVQ ·prime4v(SB), DI
|
||||
MOVQ ·primes+0(SB), prime1
|
||||
MOVQ ·primes+8(SB), prime2
|
||||
MOVQ ·primes+24(SB), prime4
|
||||
|
||||
// Load slice.
|
||||
MOVQ b_base+0(FP), SI
|
||||
MOVQ b_len+8(FP), DX
|
||||
LEAQ (SI)(DX*1), BX
|
||||
MOVQ b_base+0(FP), p
|
||||
MOVQ b_len+8(FP), n
|
||||
LEAQ (p)(n*1), end
|
||||
|
||||
// The first loop limit will be len(b)-32.
|
||||
SUBQ $32, BX
|
||||
SUBQ $32, end
|
||||
|
||||
// Check whether we have at least one block.
|
||||
CMPQ DX, $32
|
||||
CMPQ n, $32
|
||||
JLT noBlocks
|
||||
|
||||
// Set up initial state (v1, v2, v3, v4).
|
||||
MOVQ R13, R8
|
||||
ADDQ R14, R8
|
||||
MOVQ R14, R9
|
||||
XORQ R10, R10
|
||||
XORQ R11, R11
|
||||
SUBQ R13, R11
|
||||
MOVQ prime1, v1
|
||||
ADDQ prime2, v1
|
||||
MOVQ prime2, v2
|
||||
XORQ v3, v3
|
||||
XORQ v4, v4
|
||||
SUBQ prime1, v4
|
||||
|
||||
// Loop until SI > BX.
|
||||
blockLoop:
|
||||
round(R8)
|
||||
round(R9)
|
||||
round(R10)
|
||||
round(R11)
|
||||
blockLoop()
|
||||
|
||||
CMPQ SI, BX
|
||||
JLE blockLoop
|
||||
MOVQ v1, h
|
||||
ROLQ $1, h
|
||||
MOVQ v2, x
|
||||
ROLQ $7, x
|
||||
ADDQ x, h
|
||||
MOVQ v3, x
|
||||
ROLQ $12, x
|
||||
ADDQ x, h
|
||||
MOVQ v4, x
|
||||
ROLQ $18, x
|
||||
ADDQ x, h
|
||||
|
||||
MOVQ R8, AX
|
||||
ROLQ $1, AX
|
||||
MOVQ R9, R12
|
||||
ROLQ $7, R12
|
||||
ADDQ R12, AX
|
||||
MOVQ R10, R12
|
||||
ROLQ $12, R12
|
||||
ADDQ R12, AX
|
||||
MOVQ R11, R12
|
||||
ROLQ $18, R12
|
||||
ADDQ R12, AX
|
||||
|
||||
mergeRound(AX, R8)
|
||||
mergeRound(AX, R9)
|
||||
mergeRound(AX, R10)
|
||||
mergeRound(AX, R11)
|
||||
mergeRound(h, v1)
|
||||
mergeRound(h, v2)
|
||||
mergeRound(h, v3)
|
||||
mergeRound(h, v4)
|
||||
|
||||
JMP afterBlocks
|
||||
|
||||
noBlocks:
|
||||
MOVQ ·prime5v(SB), AX
|
||||
MOVQ ·primes+32(SB), h
|
||||
|
||||
afterBlocks:
|
||||
ADDQ DX, AX
|
||||
ADDQ n, h
|
||||
|
||||
// Right now BX has len(b)-32, and we want to loop until SI > len(b)-8.
|
||||
ADDQ $24, BX
|
||||
ADDQ $24, end
|
||||
CMPQ p, end
|
||||
JG try4
|
||||
|
||||
CMPQ SI, BX
|
||||
JG fourByte
|
||||
loop8:
|
||||
MOVQ (p), x
|
||||
ADDQ $8, p
|
||||
round0(x)
|
||||
XORQ x, h
|
||||
ROLQ $27, h
|
||||
IMULQ prime1, h
|
||||
ADDQ prime4, h
|
||||
|
||||
wordLoop:
|
||||
// Calculate k1.
|
||||
MOVQ (SI), R8
|
||||
ADDQ $8, SI
|
||||
IMULQ R14, R8
|
||||
ROLQ $31, R8
|
||||
IMULQ R13, R8
|
||||
CMPQ p, end
|
||||
JLE loop8
|
||||
|
||||
XORQ R8, AX
|
||||
ROLQ $27, AX
|
||||
IMULQ R13, AX
|
||||
ADDQ DI, AX
|
||||
try4:
|
||||
ADDQ $4, end
|
||||
CMPQ p, end
|
||||
JG try1
|
||||
|
||||
CMPQ SI, BX
|
||||
JLE wordLoop
|
||||
MOVL (p), x
|
||||
ADDQ $4, p
|
||||
IMULQ prime1, x
|
||||
XORQ x, h
|
||||
|
||||
fourByte:
|
||||
ADDQ $4, BX
|
||||
CMPQ SI, BX
|
||||
JG singles
|
||||
ROLQ $23, h
|
||||
IMULQ prime2, h
|
||||
ADDQ ·primes+16(SB), h
|
||||
|
||||
MOVL (SI), R8
|
||||
ADDQ $4, SI
|
||||
IMULQ R13, R8
|
||||
XORQ R8, AX
|
||||
|
||||
ROLQ $23, AX
|
||||
IMULQ R14, AX
|
||||
ADDQ ·prime3v(SB), AX
|
||||
|
||||
singles:
|
||||
ADDQ $4, BX
|
||||
CMPQ SI, BX
|
||||
try1:
|
||||
ADDQ $4, end
|
||||
CMPQ p, end
|
||||
JGE finalize
|
||||
|
||||
singlesLoop:
|
||||
MOVBQZX (SI), R12
|
||||
ADDQ $1, SI
|
||||
IMULQ ·prime5v(SB), R12
|
||||
XORQ R12, AX
|
||||
loop1:
|
||||
MOVBQZX (p), x
|
||||
ADDQ $1, p
|
||||
IMULQ ·primes+32(SB), x
|
||||
XORQ x, h
|
||||
ROLQ $11, h
|
||||
IMULQ prime1, h
|
||||
|
||||
ROLQ $11, AX
|
||||
IMULQ R13, AX
|
||||
|
||||
CMPQ SI, BX
|
||||
JL singlesLoop
|
||||
CMPQ p, end
|
||||
JL loop1
|
||||
|
||||
finalize:
|
||||
MOVQ AX, R12
|
||||
SHRQ $33, R12
|
||||
XORQ R12, AX
|
||||
IMULQ R14, AX
|
||||
MOVQ AX, R12
|
||||
SHRQ $29, R12
|
||||
XORQ R12, AX
|
||||
IMULQ ·prime3v(SB), AX
|
||||
MOVQ AX, R12
|
||||
SHRQ $32, R12
|
||||
XORQ R12, AX
|
||||
MOVQ h, x
|
||||
SHRQ $33, x
|
||||
XORQ x, h
|
||||
IMULQ prime2, h
|
||||
MOVQ h, x
|
||||
SHRQ $29, x
|
||||
XORQ x, h
|
||||
IMULQ ·primes+16(SB), h
|
||||
MOVQ h, x
|
||||
SHRQ $32, x
|
||||
XORQ x, h
|
||||
|
||||
MOVQ AX, ret+24(FP)
|
||||
MOVQ h, ret+24(FP)
|
||||
RET
|
||||
|
||||
// writeBlocks uses the same registers as above except that it uses AX to store
|
||||
// the d pointer.
|
||||
|
||||
// func writeBlocks(d *Digest, b []byte) int
|
||||
TEXT ·writeBlocks(SB), NOSPLIT, $0-40
|
||||
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
|
||||
// Load fixed primes needed for round.
|
||||
MOVQ ·prime1v(SB), R13
|
||||
MOVQ ·prime2v(SB), R14
|
||||
MOVQ ·primes+0(SB), prime1
|
||||
MOVQ ·primes+8(SB), prime2
|
||||
|
||||
// Load slice.
|
||||
MOVQ b_base+8(FP), SI
|
||||
MOVQ b_len+16(FP), DX
|
||||
LEAQ (SI)(DX*1), BX
|
||||
SUBQ $32, BX
|
||||
MOVQ b_base+8(FP), p
|
||||
MOVQ b_len+16(FP), n
|
||||
LEAQ (p)(n*1), end
|
||||
SUBQ $32, end
|
||||
|
||||
// Load vN from d.
|
||||
MOVQ d+0(FP), AX
|
||||
MOVQ 0(AX), R8 // v1
|
||||
MOVQ 8(AX), R9 // v2
|
||||
MOVQ 16(AX), R10 // v3
|
||||
MOVQ 24(AX), R11 // v4
|
||||
MOVQ s+0(FP), d
|
||||
MOVQ 0(d), v1
|
||||
MOVQ 8(d), v2
|
||||
MOVQ 16(d), v3
|
||||
MOVQ 24(d), v4
|
||||
|
||||
// We don't need to check the loop condition here; this function is
|
||||
// always called with at least one block of data to process.
|
||||
blockLoop:
|
||||
round(R8)
|
||||
round(R9)
|
||||
round(R10)
|
||||
round(R11)
|
||||
|
||||
CMPQ SI, BX
|
||||
JLE blockLoop
|
||||
blockLoop()
|
||||
|
||||
// Copy vN back to d.
|
||||
MOVQ R8, 0(AX)
|
||||
MOVQ R9, 8(AX)
|
||||
MOVQ R10, 16(AX)
|
||||
MOVQ R11, 24(AX)
|
||||
MOVQ v1, 0(d)
|
||||
MOVQ v2, 8(d)
|
||||
MOVQ v3, 16(d)
|
||||
MOVQ v4, 24(d)
|
||||
|
||||
// The number of bytes written is SI minus the old base pointer.
|
||||
SUBQ b_base+8(FP), SI
|
||||
MOVQ SI, ret+32(FP)
|
||||
// The number of bytes written is p minus the old base pointer.
|
||||
SUBQ b_base+8(FP), p
|
||||
MOVQ p, ret+32(FP)
|
||||
|
||||
RET
|
||||
|
|
|
@ -0,0 +1,183 @@
|
|||
//go:build !appengine && gc && !purego
|
||||
// +build !appengine
|
||||
// +build gc
|
||||
// +build !purego
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// Registers:
|
||||
#define digest R1
|
||||
#define h R2 // return value
|
||||
#define p R3 // input pointer
|
||||
#define n R4 // input length
|
||||
#define nblocks R5 // n / 32
|
||||
#define prime1 R7
|
||||
#define prime2 R8
|
||||
#define prime3 R9
|
||||
#define prime4 R10
|
||||
#define prime5 R11
|
||||
#define v1 R12
|
||||
#define v2 R13
|
||||
#define v3 R14
|
||||
#define v4 R15
|
||||
#define x1 R20
|
||||
#define x2 R21
|
||||
#define x3 R22
|
||||
#define x4 R23
|
||||
|
||||
#define round(acc, x) \
|
||||
MADD prime2, acc, x, acc \
|
||||
ROR $64-31, acc \
|
||||
MUL prime1, acc
|
||||
|
||||
// round0 performs the operation x = round(0, x).
|
||||
#define round0(x) \
|
||||
MUL prime2, x \
|
||||
ROR $64-31, x \
|
||||
MUL prime1, x
|
||||
|
||||
#define mergeRound(acc, x) \
|
||||
round0(x) \
|
||||
EOR x, acc \
|
||||
MADD acc, prime4, prime1, acc
|
||||
|
||||
// blockLoop processes as many 32-byte blocks as possible,
|
||||
// updating v1, v2, v3, and v4. It assumes that n >= 32.
|
||||
#define blockLoop() \
|
||||
LSR $5, n, nblocks \
|
||||
PCALIGN $16 \
|
||||
loop: \
|
||||
LDP.P 16(p), (x1, x2) \
|
||||
LDP.P 16(p), (x3, x4) \
|
||||
round(v1, x1) \
|
||||
round(v2, x2) \
|
||||
round(v3, x3) \
|
||||
round(v4, x4) \
|
||||
SUB $1, nblocks \
|
||||
CBNZ nblocks, loop
|
||||
|
||||
// func Sum64(b []byte) uint64
|
||||
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
|
||||
LDP b_base+0(FP), (p, n)
|
||||
|
||||
LDP ·primes+0(SB), (prime1, prime2)
|
||||
LDP ·primes+16(SB), (prime3, prime4)
|
||||
MOVD ·primes+32(SB), prime5
|
||||
|
||||
CMP $32, n
|
||||
CSEL LT, prime5, ZR, h // if n < 32 { h = prime5 } else { h = 0 }
|
||||
BLT afterLoop
|
||||
|
||||
ADD prime1, prime2, v1
|
||||
MOVD prime2, v2
|
||||
MOVD $0, v3
|
||||
NEG prime1, v4
|
||||
|
||||
blockLoop()
|
||||
|
||||
ROR $64-1, v1, x1
|
||||
ROR $64-7, v2, x2
|
||||
ADD x1, x2
|
||||
ROR $64-12, v3, x3
|
||||
ROR $64-18, v4, x4
|
||||
ADD x3, x4
|
||||
ADD x2, x4, h
|
||||
|
||||
mergeRound(h, v1)
|
||||
mergeRound(h, v2)
|
||||
mergeRound(h, v3)
|
||||
mergeRound(h, v4)
|
||||
|
||||
afterLoop:
|
||||
ADD n, h
|
||||
|
||||
TBZ $4, n, try8
|
||||
LDP.P 16(p), (x1, x2)
|
||||
|
||||
round0(x1)
|
||||
|
||||
// NOTE: here and below, sequencing the EOR after the ROR (using a
|
||||
// rotated register) is worth a small but measurable speedup for small
|
||||
// inputs.
|
||||
ROR $64-27, h
|
||||
EOR x1 @> 64-27, h, h
|
||||
MADD h, prime4, prime1, h
|
||||
|
||||
round0(x2)
|
||||
ROR $64-27, h
|
||||
EOR x2 @> 64-27, h, h
|
||||
MADD h, prime4, prime1, h
|
||||
|
||||
try8:
|
||||
TBZ $3, n, try4
|
||||
MOVD.P 8(p), x1
|
||||
|
||||
round0(x1)
|
||||
ROR $64-27, h
|
||||
EOR x1 @> 64-27, h, h
|
||||
MADD h, prime4, prime1, h
|
||||
|
||||
try4:
|
||||
TBZ $2, n, try2
|
||||
MOVWU.P 4(p), x2
|
||||
|
||||
MUL prime1, x2
|
||||
ROR $64-23, h
|
||||
EOR x2 @> 64-23, h, h
|
||||
MADD h, prime3, prime2, h
|
||||
|
||||
try2:
|
||||
TBZ $1, n, try1
|
||||
MOVHU.P 2(p), x3
|
||||
AND $255, x3, x1
|
||||
LSR $8, x3, x2
|
||||
|
||||
MUL prime5, x1
|
||||
ROR $64-11, h
|
||||
EOR x1 @> 64-11, h, h
|
||||
MUL prime1, h
|
||||
|
||||
MUL prime5, x2
|
||||
ROR $64-11, h
|
||||
EOR x2 @> 64-11, h, h
|
||||
MUL prime1, h
|
||||
|
||||
try1:
|
||||
TBZ $0, n, finalize
|
||||
MOVBU (p), x4
|
||||
|
||||
MUL prime5, x4
|
||||
ROR $64-11, h
|
||||
EOR x4 @> 64-11, h, h
|
||||
MUL prime1, h
|
||||
|
||||
finalize:
|
||||
EOR h >> 33, h
|
||||
MUL prime2, h
|
||||
EOR h >> 29, h
|
||||
MUL prime3, h
|
||||
EOR h >> 32, h
|
||||
|
||||
MOVD h, ret+24(FP)
|
||||
RET
|
||||
|
||||
// func writeBlocks(d *Digest, b []byte) int
|
||||
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
|
||||
LDP ·primes+0(SB), (prime1, prime2)
|
||||
|
||||
// Load state. Assume v[1-4] are stored contiguously.
|
||||
MOVD d+0(FP), digest
|
||||
LDP 0(digest), (v1, v2)
|
||||
LDP 16(digest), (v3, v4)
|
||||
|
||||
LDP b_base+8(FP), (p, n)
|
||||
|
||||
blockLoop()
|
||||
|
||||
// Store updated state.
|
||||
STP (v1, v2), 0(digest)
|
||||
STP (v3, v4), 16(digest)
|
||||
|
||||
BIC $31, n
|
||||
MOVD n, ret+32(FP)
|
||||
RET
|
|
@ -1,3 +1,5 @@
|
|||
//go:build (amd64 || arm64) && !appengine && gc && !purego
|
||||
// +build amd64 arm64
|
||||
// +build !appengine
|
||||
// +build gc
|
||||
// +build !purego
|
|
@ -1,4 +1,5 @@
|
|||
// +build !amd64 appengine !gc purego
|
||||
//go:build (!amd64 && !arm64) || appengine || !gc || purego
|
||||
// +build !amd64,!arm64 appengine !gc purego
|
||||
|
||||
package xxhash
|
||||
|
||||
|
@ -14,10 +15,10 @@ func Sum64(b []byte) uint64 {
|
|||
var h uint64
|
||||
|
||||
if n >= 32 {
|
||||
v1 := prime1v + prime2
|
||||
v1 := primes[0] + prime2
|
||||
v2 := prime2
|
||||
v3 := uint64(0)
|
||||
v4 := -prime1v
|
||||
v4 := -primes[0]
|
||||
for len(b) >= 32 {
|
||||
v1 = round(v1, u64(b[0:8:len(b)]))
|
||||
v2 = round(v2, u64(b[8:16:len(b)]))
|
||||
|
@ -36,19 +37,18 @@ func Sum64(b []byte) uint64 {
|
|||
|
||||
h += uint64(n)
|
||||
|
||||
i, end := 0, len(b)
|
||||
for ; i+8 <= end; i += 8 {
|
||||
k1 := round(0, u64(b[i:i+8:len(b)]))
|
||||
for ; len(b) >= 8; b = b[8:] {
|
||||
k1 := round(0, u64(b[:8]))
|
||||
h ^= k1
|
||||
h = rol27(h)*prime1 + prime4
|
||||
}
|
||||
if i+4 <= end {
|
||||
h ^= uint64(u32(b[i:i+4:len(b)])) * prime1
|
||||
if len(b) >= 4 {
|
||||
h ^= uint64(u32(b[:4])) * prime1
|
||||
h = rol23(h)*prime2 + prime3
|
||||
i += 4
|
||||
b = b[4:]
|
||||
}
|
||||
for ; i < end; i++ {
|
||||
h ^= uint64(b[i]) * prime5
|
||||
for ; len(b) > 0; b = b[1:] {
|
||||
h ^= uint64(b[0]) * prime5
|
||||
h = rol11(h) * prime1
|
||||
}
|
||||
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
//go:build appengine
|
||||
// +build appengine
|
||||
|
||||
// This file contains the safe implementations of otherwise unsafe-using code.
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
//go:build !appengine
|
||||
// +build !appengine
|
||||
|
||||
// This file encapsulates usage of unsafe.
|
||||
|
@ -11,7 +12,7 @@ import (
|
|||
|
||||
// In the future it's possible that compiler optimizations will make these
|
||||
// XxxString functions unnecessary by realizing that calls such as
|
||||
// Sum64([]byte(s)) don't need to copy s. See https://golang.org/issue/2205.
|
||||
// Sum64([]byte(s)) don't need to copy s. See https://go.dev/issue/2205.
|
||||
// If that happens, even if we keep these functions they can be replaced with
|
||||
// the trivial safe code.
|
||||
|
||||
|
|
|
@ -12,7 +12,7 @@ github.com/Microsoft/go-winio/pkg/guid
|
|||
# github.com/beorn7/perks v1.0.1
|
||||
## explicit; go 1.11
|
||||
github.com/beorn7/perks/quantile
|
||||
# github.com/cespare/xxhash/v2 v2.1.2
|
||||
# github.com/cespare/xxhash/v2 v2.2.0
|
||||
## explicit; go 1.11
|
||||
github.com/cespare/xxhash/v2
|
||||
# github.com/container-orchestrated-devices/container-device-interface v0.6.0
|
||||
|
|
Loading…
Reference in New Issue