vendor: github.com/klauspost/compress v1.16.3

full diff: https://github.com/klauspost/compress/compare/v1.15.12...v1.16.3

Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
This commit is contained in:
Sebastiaan van Stijn 2023-03-28 16:43:43 +02:00
parent 5843fbd5f5
commit cd9c6a4c02
No known key found for this signature in database
GPG Key ID: 76698F39D527CE8C
37 changed files with 1021 additions and 899 deletions

View File

@ -57,7 +57,7 @@ require (
github.com/golang/protobuf v1.5.2 // indirect github.com/golang/protobuf v1.5.2 // indirect
github.com/gorilla/mux v1.8.0 // indirect github.com/gorilla/mux v1.8.0 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/klauspost/compress v1.15.12 // indirect github.com/klauspost/compress v1.16.3 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
github.com/miekg/pkcs11 v1.1.1 // indirect github.com/miekg/pkcs11 v1.1.1 // indirect
github.com/moby/sys/symlink v0.2.0 // indirect github.com/moby/sys/symlink v0.2.0 // indirect

View File

@ -252,8 +252,8 @@ github.com/julienschmidt/httprouter v1.3.0/go.mod h1:JR6WtHb+2LUe8TCKY3cZOxFyyO8
github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00= github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/klauspost/compress v1.15.12 h1:YClS/PImqYbn+UILDnqxQCZ3RehC9N318SU3kElDUEM= github.com/klauspost/compress v1.16.3 h1:XuJt9zzcnaz6a16/OU53ZjWp/v7/42WcR5t2a0PcNQY=
github.com/klauspost/compress v1.15.12/go.mod h1:QPwzmACJjUTFsnSHH934V6woptycfrDDJnH7hvFVbGM= github.com/klauspost/compress v1.16.3/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc=

View File

@ -3,7 +3,7 @@
before: before:
hooks: hooks:
- ./gen.sh - ./gen.sh
- go install mvdan.cc/garble@latest - go install mvdan.cc/garble@v0.9.3
builds: builds:
- -

View File

@ -9,7 +9,6 @@ This package provides various compression algorithms.
* [huff0](https://github.com/klauspost/compress/tree/master/huff0) and [FSE](https://github.com/klauspost/compress/tree/master/fse) implementations for raw entropy encoding. * [huff0](https://github.com/klauspost/compress/tree/master/huff0) and [FSE](https://github.com/klauspost/compress/tree/master/fse) implementations for raw entropy encoding.
* [gzhttp](https://github.com/klauspost/compress/tree/master/gzhttp) Provides client and server wrappers for handling gzipped requests efficiently. * [gzhttp](https://github.com/klauspost/compress/tree/master/gzhttp) Provides client and server wrappers for handling gzipped requests efficiently.
* [pgzip](https://github.com/klauspost/pgzip) is a separate package that provides a very fast parallel gzip implementation. * [pgzip](https://github.com/klauspost/pgzip) is a separate package that provides a very fast parallel gzip implementation.
* [fuzz package](https://github.com/klauspost/compress-fuzz) for fuzz testing all compressors/decompressors here.
[![Go Reference](https://pkg.go.dev/badge/klauspost/compress.svg)](https://pkg.go.dev/github.com/klauspost/compress?tab=subdirectories) [![Go Reference](https://pkg.go.dev/badge/klauspost/compress.svg)](https://pkg.go.dev/github.com/klauspost/compress?tab=subdirectories)
[![Go](https://github.com/klauspost/compress/actions/workflows/go.yml/badge.svg)](https://github.com/klauspost/compress/actions/workflows/go.yml) [![Go](https://github.com/klauspost/compress/actions/workflows/go.yml/badge.svg)](https://github.com/klauspost/compress/actions/workflows/go.yml)
@ -17,6 +16,43 @@ This package provides various compression algorithms.
# changelog # changelog
* Mar 13, 2023 - [v1.16.1](https://github.com/klauspost/compress/releases/tag/v1.16.1)
* zstd: Speed up + improve best encoder by @greatroar in https://github.com/klauspost/compress/pull/776
* gzhttp: Add optional [BREACH mitigation](https://github.com/klauspost/compress/tree/master/gzhttp#breach-mitigation). https://github.com/klauspost/compress/pull/762 https://github.com/klauspost/compress/pull/768 https://github.com/klauspost/compress/pull/769 https://github.com/klauspost/compress/pull/770 https://github.com/klauspost/compress/pull/767
* s2: Add Intel LZ4s converter https://github.com/klauspost/compress/pull/766
* zstd: Minor bug fixes https://github.com/klauspost/compress/pull/771 https://github.com/klauspost/compress/pull/772 https://github.com/klauspost/compress/pull/773
* huff0: Speed up compress1xDo by @greatroar in https://github.com/klauspost/compress/pull/774
* Feb 26, 2023 - [v1.16.0](https://github.com/klauspost/compress/releases/tag/v1.16.0)
* s2: Add [Dictionary](https://github.com/klauspost/compress/tree/master/s2#dictionaries) support. https://github.com/klauspost/compress/pull/685
* s2: Add Compression Size Estimate. https://github.com/klauspost/compress/pull/752
* s2: Add support for custom stream encoder. https://github.com/klauspost/compress/pull/755
* s2: Add LZ4 block converter. https://github.com/klauspost/compress/pull/748
* s2: Support io.ReaderAt in ReadSeeker. https://github.com/klauspost/compress/pull/747
* s2c/s2sx: Use concurrent decoding. https://github.com/klauspost/compress/pull/746
* Jan 21st, 2023 (v1.15.15)
* deflate: Improve level 7-9 by @klauspost in https://github.com/klauspost/compress/pull/739
* zstd: Add delta encoding support by @greatroar in https://github.com/klauspost/compress/pull/728
* zstd: Various speed improvements by @greatroar https://github.com/klauspost/compress/pull/741 https://github.com/klauspost/compress/pull/734 https://github.com/klauspost/compress/pull/736 https://github.com/klauspost/compress/pull/744 https://github.com/klauspost/compress/pull/743 https://github.com/klauspost/compress/pull/745
* gzhttp: Add SuffixETag() and DropETag() options to prevent ETag collisions on compressed responses by @willbicks in https://github.com/klauspost/compress/pull/740
* Jan 3rd, 2023 (v1.15.14)
* flate: Improve speed in big stateless blocks https://github.com/klauspost/compress/pull/718
* zstd: Minor speed tweaks by @greatroar in https://github.com/klauspost/compress/pull/716 https://github.com/klauspost/compress/pull/720
* export NoGzipResponseWriter for custom ResponseWriter wrappers by @harshavardhana in https://github.com/klauspost/compress/pull/722
* s2: Add example for indexing an existing stream https://github.com/klauspost/compress/pull/723
* Dec 11, 2022 (v1.15.13)
* zstd: Add [MaxEncodedSize](https://pkg.go.dev/github.com/klauspost/compress@v1.15.13/zstd#Encoder.MaxEncodedSize) to encoder https://github.com/klauspost/compress/pull/691
* zstd: Various tweaks and improvements https://github.com/klauspost/compress/pull/693 https://github.com/klauspost/compress/pull/695 https://github.com/klauspost/compress/pull/696 https://github.com/klauspost/compress/pull/701 https://github.com/klauspost/compress/pull/702 https://github.com/klauspost/compress/pull/703 https://github.com/klauspost/compress/pull/704 https://github.com/klauspost/compress/pull/705 https://github.com/klauspost/compress/pull/706 https://github.com/klauspost/compress/pull/707 https://github.com/klauspost/compress/pull/708
* Oct 26, 2022 (v1.15.12)
* zstd: Tweak decoder allocs. https://github.com/klauspost/compress/pull/680
* gzhttp: Always delete `HeaderNoCompression` https://github.com/klauspost/compress/pull/683
* Sept 26, 2022 (v1.15.11) * Sept 26, 2022 (v1.15.11)
* flate: Improve level 1-3 compression https://github.com/klauspost/compress/pull/678 * flate: Improve level 1-3 compression https://github.com/klauspost/compress/pull/678

View File

@ -146,54 +146,51 @@ func (s *Scratch) compress(src []byte) error {
c1.encodeZero(tt[src[ip-2]]) c1.encodeZero(tt[src[ip-2]])
ip -= 2 ip -= 2
} }
src = src[:ip]
// Main compression loop. // Main compression loop.
switch { switch {
case !s.zeroBits && s.actualTableLog <= 8: case !s.zeroBits && s.actualTableLog <= 8:
// We can encode 4 symbols without requiring a flush. // We can encode 4 symbols without requiring a flush.
// We do not need to check if any output is 0 bits. // We do not need to check if any output is 0 bits.
for ip >= 4 { for ; len(src) >= 4; src = src[:len(src)-4] {
s.bw.flush32() s.bw.flush32()
v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1] v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
c2.encode(tt[v0]) c2.encode(tt[v0])
c1.encode(tt[v1]) c1.encode(tt[v1])
c2.encode(tt[v2]) c2.encode(tt[v2])
c1.encode(tt[v3]) c1.encode(tt[v3])
ip -= 4
} }
case !s.zeroBits: case !s.zeroBits:
// We do not need to check if any output is 0 bits. // We do not need to check if any output is 0 bits.
for ip >= 4 { for ; len(src) >= 4; src = src[:len(src)-4] {
s.bw.flush32() s.bw.flush32()
v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1] v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
c2.encode(tt[v0]) c2.encode(tt[v0])
c1.encode(tt[v1]) c1.encode(tt[v1])
s.bw.flush32() s.bw.flush32()
c2.encode(tt[v2]) c2.encode(tt[v2])
c1.encode(tt[v3]) c1.encode(tt[v3])
ip -= 4
} }
case s.actualTableLog <= 8: case s.actualTableLog <= 8:
// We can encode 4 symbols without requiring a flush // We can encode 4 symbols without requiring a flush
for ip >= 4 { for ; len(src) >= 4; src = src[:len(src)-4] {
s.bw.flush32() s.bw.flush32()
v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1] v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
c2.encodeZero(tt[v0]) c2.encodeZero(tt[v0])
c1.encodeZero(tt[v1]) c1.encodeZero(tt[v1])
c2.encodeZero(tt[v2]) c2.encodeZero(tt[v2])
c1.encodeZero(tt[v3]) c1.encodeZero(tt[v3])
ip -= 4
} }
default: default:
for ip >= 4 { for ; len(src) >= 4; src = src[:len(src)-4] {
s.bw.flush32() s.bw.flush32()
v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1] v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
c2.encodeZero(tt[v0]) c2.encodeZero(tt[v0])
c1.encodeZero(tt[v1]) c1.encodeZero(tt[v1])
s.bw.flush32() s.bw.flush32()
c2.encodeZero(tt[v2]) c2.encodeZero(tt[v2])
c1.encodeZero(tt[v3]) c1.encodeZero(tt[v3])
ip -= 4
} }
} }
@ -459,15 +456,17 @@ func (s *Scratch) countSimple(in []byte) (max int) {
for _, v := range in { for _, v := range in {
s.count[v]++ s.count[v]++
} }
m := uint32(0) m, symlen := uint32(0), s.symbolLen
for i, v := range s.count[:] { for i, v := range s.count[:] {
if v == 0 {
continue
}
if v > m { if v > m {
m = v m = v
} }
if v > 0 { symlen = uint16(i) + 1
s.symbolLen = uint16(i) + 1
}
} }
s.symbolLen = symlen
return int(m) return int(m)
} }

View File

@ -260,7 +260,9 @@ func (s *Scratch) buildDtable() error {
// If the buffer is over-read an error is returned. // If the buffer is over-read an error is returned.
func (s *Scratch) decompress() error { func (s *Scratch) decompress() error {
br := &s.bits br := &s.bits
br.init(s.br.unread()) if err := br.init(s.br.unread()); err != nil {
return err
}
var s1, s2 decoder var s1, s2 decoder
// Initialize and decode first state and symbol. // Initialize and decode first state and symbol.

View File

@ -67,7 +67,6 @@ func (b *bitReaderBytes) fillFast() {
// 2 bounds checks. // 2 bounds checks.
v := b.in[b.off-4 : b.off] v := b.in[b.off-4 : b.off]
v = v[:4]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value |= uint64(low) << (b.bitsRead - 32) b.value |= uint64(low) << (b.bitsRead - 32)
b.bitsRead -= 32 b.bitsRead -= 32
@ -88,8 +87,7 @@ func (b *bitReaderBytes) fill() {
return return
} }
if b.off > 4 { if b.off > 4 {
v := b.in[b.off-4:] v := b.in[b.off-4 : b.off]
v = v[:4]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value |= uint64(low) << (b.bitsRead - 32) b.value |= uint64(low) << (b.bitsRead - 32)
b.bitsRead -= 32 b.bitsRead -= 32
@ -179,7 +177,6 @@ func (b *bitReaderShifted) fillFast() {
// 2 bounds checks. // 2 bounds checks.
v := b.in[b.off-4 : b.off] v := b.in[b.off-4 : b.off]
v = v[:4]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value |= uint64(low) << ((b.bitsRead - 32) & 63) b.value |= uint64(low) << ((b.bitsRead - 32) & 63)
b.bitsRead -= 32 b.bitsRead -= 32
@ -200,8 +197,7 @@ func (b *bitReaderShifted) fill() {
return return
} }
if b.off > 4 { if b.off > 4 {
v := b.in[b.off-4:] v := b.in[b.off-4 : b.off]
v = v[:4]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value |= uint64(low) << ((b.bitsRead - 32) & 63) b.value |= uint64(low) << ((b.bitsRead - 32) & 63)
b.bitsRead -= 32 b.bitsRead -= 32

View File

@ -60,6 +60,22 @@ func (b *bitWriter) encTwoSymbols(ct cTable, av, bv byte) {
b.nBits += encA.nBits + encB.nBits b.nBits += encA.nBits + encB.nBits
} }
// encFourSymbols appends the codes of four table entries (up to 32 bits in
// total) to the bit container in one step.
//
// encA occupies the lowest bits of the appended group, followed by encB,
// encC and encD, each placed directly above the bits of the previous entry.
// No space check is performed, so the caller must have flushed b recently.
func (b *bitWriter) encFourSymbols(encA, encB, encC, encD cTableEntry) {
	// Running bit offsets: where each following entry starts inside the group.
	offB := encA.nBits
	offC := offB + encB.nBits
	offD := offC + encC.nBits
	total := offD + encD.nBits

	// Assemble the four codes into a single word before touching the container.
	group := uint64(encA.val)
	group |= uint64(encB.val) << (offB & 63)
	group |= uint64(encC.val) << (offC & 63)
	group |= uint64(encD.val) << (offD & 63)

	b.bitContainer |= group << (b.nBits & 63)
	b.nBits += total
}
// flush32 will flush out, so there are at least 32 bits available for writing. // flush32 will flush out, so there are at least 32 bits available for writing.
func (b *bitWriter) flush32() { func (b *bitWriter) flush32() {
if b.nBits < 32 { if b.nBits < 32 {

View File

@ -248,8 +248,7 @@ func (s *Scratch) compress1xDo(dst, src []byte) ([]byte, error) {
tmp := src[n : n+4] tmp := src[n : n+4]
// tmp should be len 4 // tmp should be len 4
bw.flush32() bw.flush32()
bw.encTwoSymbols(cTable, tmp[3], tmp[2]) bw.encFourSymbols(cTable[tmp[3]], cTable[tmp[2]], cTable[tmp[1]], cTable[tmp[0]])
bw.encTwoSymbols(cTable, tmp[1], tmp[0])
} }
} else { } else {
for ; n >= 0; n -= 4 { for ; n >= 0; n -= 4 {
@ -365,29 +364,29 @@ func (s *Scratch) countSimple(in []byte) (max int, reuse bool) {
m := uint32(0) m := uint32(0)
if len(s.prevTable) > 0 { if len(s.prevTable) > 0 {
for i, v := range s.count[:] { for i, v := range s.count[:] {
if v == 0 {
continue
}
if v > m { if v > m {
m = v m = v
} }
if v > 0 { s.symbolLen = uint16(i) + 1
s.symbolLen = uint16(i) + 1 if i >= len(s.prevTable) {
if i >= len(s.prevTable) { reuse = false
reuse = false } else if s.prevTable[i].nBits == 0 {
} else { reuse = false
if s.prevTable[i].nBits == 0 {
reuse = false
}
}
} }
} }
return int(m), reuse return int(m), reuse
} }
for i, v := range s.count[:] { for i, v := range s.count[:] {
if v == 0 {
continue
}
if v > m { if v > m {
m = v m = v
} }
if v > 0 { s.symbolLen = uint16(i) + 1
s.symbolLen = uint16(i) + 1
}
} }
return int(m), false return int(m), false
} }
@ -484,34 +483,35 @@ func (s *Scratch) buildCTable() error {
// Different from reference implementation. // Different from reference implementation.
huffNode0 := s.nodes[0 : huffNodesLen+1] huffNode0 := s.nodes[0 : huffNodesLen+1]
for huffNode[nonNullRank].count == 0 { for huffNode[nonNullRank].count() == 0 {
nonNullRank-- nonNullRank--
} }
lowS := int16(nonNullRank) lowS := int16(nonNullRank)
nodeRoot := nodeNb + lowS - 1 nodeRoot := nodeNb + lowS - 1
lowN := nodeNb lowN := nodeNb
huffNode[nodeNb].count = huffNode[lowS].count + huffNode[lowS-1].count huffNode[nodeNb].setCount(huffNode[lowS].count() + huffNode[lowS-1].count())
huffNode[lowS].parent, huffNode[lowS-1].parent = uint16(nodeNb), uint16(nodeNb) huffNode[lowS].setParent(nodeNb)
huffNode[lowS-1].setParent(nodeNb)
nodeNb++ nodeNb++
lowS -= 2 lowS -= 2
for n := nodeNb; n <= nodeRoot; n++ { for n := nodeNb; n <= nodeRoot; n++ {
huffNode[n].count = 1 << 30 huffNode[n].setCount(1 << 30)
} }
// fake entry, strong barrier // fake entry, strong barrier
huffNode0[0].count = 1 << 31 huffNode0[0].setCount(1 << 31)
// create parents // create parents
for nodeNb <= nodeRoot { for nodeNb <= nodeRoot {
var n1, n2 int16 var n1, n2 int16
if huffNode0[lowS+1].count < huffNode0[lowN+1].count { if huffNode0[lowS+1].count() < huffNode0[lowN+1].count() {
n1 = lowS n1 = lowS
lowS-- lowS--
} else { } else {
n1 = lowN n1 = lowN
lowN++ lowN++
} }
if huffNode0[lowS+1].count < huffNode0[lowN+1].count { if huffNode0[lowS+1].count() < huffNode0[lowN+1].count() {
n2 = lowS n2 = lowS
lowS-- lowS--
} else { } else {
@ -519,18 +519,19 @@ func (s *Scratch) buildCTable() error {
lowN++ lowN++
} }
huffNode[nodeNb].count = huffNode0[n1+1].count + huffNode0[n2+1].count huffNode[nodeNb].setCount(huffNode0[n1+1].count() + huffNode0[n2+1].count())
huffNode0[n1+1].parent, huffNode0[n2+1].parent = uint16(nodeNb), uint16(nodeNb) huffNode0[n1+1].setParent(nodeNb)
huffNode0[n2+1].setParent(nodeNb)
nodeNb++ nodeNb++
} }
// distribute weights (unlimited tree height) // distribute weights (unlimited tree height)
huffNode[nodeRoot].nbBits = 0 huffNode[nodeRoot].setNbBits(0)
for n := nodeRoot - 1; n >= startNode; n-- { for n := nodeRoot - 1; n >= startNode; n-- {
huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1 huffNode[n].setNbBits(huffNode[huffNode[n].parent()].nbBits() + 1)
} }
for n := uint16(0); n <= nonNullRank; n++ { for n := uint16(0); n <= nonNullRank; n++ {
huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1 huffNode[n].setNbBits(huffNode[huffNode[n].parent()].nbBits() + 1)
} }
s.actualTableLog = s.setMaxHeight(int(nonNullRank)) s.actualTableLog = s.setMaxHeight(int(nonNullRank))
maxNbBits := s.actualTableLog maxNbBits := s.actualTableLog
@ -542,7 +543,7 @@ func (s *Scratch) buildCTable() error {
var nbPerRank [tableLogMax + 1]uint16 var nbPerRank [tableLogMax + 1]uint16
var valPerRank [16]uint16 var valPerRank [16]uint16
for _, v := range huffNode[:nonNullRank+1] { for _, v := range huffNode[:nonNullRank+1] {
nbPerRank[v.nbBits]++ nbPerRank[v.nbBits()]++
} }
// determine starting value per rank // determine starting value per rank
{ {
@ -557,7 +558,7 @@ func (s *Scratch) buildCTable() error {
// push nbBits per symbol, symbol order // push nbBits per symbol, symbol order
for _, v := range huffNode[:nonNullRank+1] { for _, v := range huffNode[:nonNullRank+1] {
s.cTable[v.symbol].nBits = v.nbBits s.cTable[v.symbol()].nBits = v.nbBits()
} }
// assign value within rank, symbol order // assign value within rank, symbol order
@ -603,12 +604,12 @@ func (s *Scratch) huffSort() {
pos := rank[r].current pos := rank[r].current
rank[r].current++ rank[r].current++
prev := nodes[(pos-1)&huffNodesMask] prev := nodes[(pos-1)&huffNodesMask]
for pos > rank[r].base && c > prev.count { for pos > rank[r].base && c > prev.count() {
nodes[pos&huffNodesMask] = prev nodes[pos&huffNodesMask] = prev
pos-- pos--
prev = nodes[(pos-1)&huffNodesMask] prev = nodes[(pos-1)&huffNodesMask]
} }
nodes[pos&huffNodesMask] = nodeElt{count: c, symbol: byte(n)} nodes[pos&huffNodesMask] = makeNodeElt(c, byte(n))
} }
} }
@ -617,7 +618,7 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
huffNode := s.nodes[1 : huffNodesLen+1] huffNode := s.nodes[1 : huffNodesLen+1]
//huffNode = huffNode[: huffNodesLen] //huffNode = huffNode[: huffNodesLen]
largestBits := huffNode[lastNonNull].nbBits largestBits := huffNode[lastNonNull].nbBits()
// early exit : no elt > maxNbBits // early exit : no elt > maxNbBits
if largestBits <= maxNbBits { if largestBits <= maxNbBits {
@ -627,14 +628,14 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
baseCost := int(1) << (largestBits - maxNbBits) baseCost := int(1) << (largestBits - maxNbBits)
n := uint32(lastNonNull) n := uint32(lastNonNull)
for huffNode[n].nbBits > maxNbBits { for huffNode[n].nbBits() > maxNbBits {
totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits)) totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits()))
huffNode[n].nbBits = maxNbBits huffNode[n].setNbBits(maxNbBits)
n-- n--
} }
// n stops at huffNode[n].nbBits <= maxNbBits // n stops at huffNode[n].nbBits <= maxNbBits
for huffNode[n].nbBits == maxNbBits { for huffNode[n].nbBits() == maxNbBits {
n-- n--
} }
// n ends at index of smallest symbol using < maxNbBits // n ends at index of smallest symbol using < maxNbBits
@ -655,10 +656,10 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
{ {
currentNbBits := maxNbBits currentNbBits := maxNbBits
for pos := int(n); pos >= 0; pos-- { for pos := int(n); pos >= 0; pos-- {
if huffNode[pos].nbBits >= currentNbBits { if huffNode[pos].nbBits() >= currentNbBits {
continue continue
} }
currentNbBits = huffNode[pos].nbBits // < maxNbBits currentNbBits = huffNode[pos].nbBits() // < maxNbBits
rankLast[maxNbBits-currentNbBits] = uint32(pos) rankLast[maxNbBits-currentNbBits] = uint32(pos)
} }
} }
@ -675,8 +676,8 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
if lowPos == noSymbol { if lowPos == noSymbol {
break break
} }
highTotal := huffNode[highPos].count highTotal := huffNode[highPos].count()
lowTotal := 2 * huffNode[lowPos].count lowTotal := 2 * huffNode[lowPos].count()
if highTotal <= lowTotal { if highTotal <= lowTotal {
break break
} }
@ -692,13 +693,14 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
// this rank is no longer empty // this rank is no longer empty
rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease] rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease]
} }
huffNode[rankLast[nBitsToDecrease]].nbBits++ huffNode[rankLast[nBitsToDecrease]].setNbBits(1 +
huffNode[rankLast[nBitsToDecrease]].nbBits())
if rankLast[nBitsToDecrease] == 0 { if rankLast[nBitsToDecrease] == 0 {
/* special case, reached largest symbol */ /* special case, reached largest symbol */
rankLast[nBitsToDecrease] = noSymbol rankLast[nBitsToDecrease] = noSymbol
} else { } else {
rankLast[nBitsToDecrease]-- rankLast[nBitsToDecrease]--
if huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease { if huffNode[rankLast[nBitsToDecrease]].nbBits() != maxNbBits-nBitsToDecrease {
rankLast[nBitsToDecrease] = noSymbol /* this rank is now empty */ rankLast[nBitsToDecrease] = noSymbol /* this rank is now empty */
} }
} }
@ -706,15 +708,15 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
for totalCost < 0 { /* Sometimes, cost correction overshoot */ for totalCost < 0 { /* Sometimes, cost correction overshoot */
if rankLast[1] == noSymbol { /* special case : no rank 1 symbol (using maxNbBits-1); let's create one from largest rank 0 (using maxNbBits) */ if rankLast[1] == noSymbol { /* special case : no rank 1 symbol (using maxNbBits-1); let's create one from largest rank 0 (using maxNbBits) */
for huffNode[n].nbBits == maxNbBits { for huffNode[n].nbBits() == maxNbBits {
n-- n--
} }
huffNode[n+1].nbBits-- huffNode[n+1].setNbBits(huffNode[n+1].nbBits() - 1)
rankLast[1] = n + 1 rankLast[1] = n + 1
totalCost++ totalCost++
continue continue
} }
huffNode[rankLast[1]+1].nbBits-- huffNode[rankLast[1]+1].setNbBits(huffNode[rankLast[1]+1].nbBits() - 1)
rankLast[1]++ rankLast[1]++
totalCost++ totalCost++
} }
@ -722,9 +724,26 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
return maxNbBits return maxNbBits
} }
type nodeElt struct { // A nodeElt is the fields
count uint32 //
parent uint16 // count uint32
symbol byte // parent uint16
nbBits uint8 // symbol byte
// nbBits uint8
//
// in some order, all squashed into an integer so that the compiler
// always loads and stores entire nodeElts instead of separate fields.
type nodeElt uint64
func makeNodeElt(count uint32, symbol byte) nodeElt {
return nodeElt(count) | nodeElt(symbol)<<48
} }
// count returns the low 32 bits of the packed element.
func (e *nodeElt) count() uint32 {
	return uint32(*e)
}

// parent returns bits 32-47 of the packed element.
func (e *nodeElt) parent() uint16 {
	return uint16(*e >> 32)
}

// symbol returns bits 48-55 of the packed element.
func (e *nodeElt) symbol() byte {
	return byte(*e >> 48)
}

// nbBits returns the top 8 bits (56-63) of the packed element.
func (e *nodeElt) nbBits() uint8 {
	return uint8(*e >> 56)
}

// setCount overwrites the low 32 bits and leaves every other bit untouched.
func (e *nodeElt) setCount(c uint32) {
	const countMask nodeElt = 0xffffffff
	*e = (*e &^ countMask) | nodeElt(c)
}

// setParent overwrites bits 32-47 with the 16-bit pattern of p and leaves
// every other bit untouched.
func (e *nodeElt) setParent(p int16) {
	const parentMask nodeElt = 0xffff << 32
	*e = (*e &^ parentMask) | nodeElt(uint16(p))<<32
}

// setNbBits overwrites the top 8 bits and leaves every other bit untouched.
func (e *nodeElt) setNbBits(n uint8) {
	const nbBitsMask nodeElt = 0xff << 56
	*e = (*e &^ nbBitsMask) | nodeElt(n)<<56
}

View File

@ -61,7 +61,7 @@ func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
b, err := fse.Decompress(in[:iSize], s.fse) b, err := fse.Decompress(in[:iSize], s.fse)
s.fse.Out = nil s.fse.Out = nil
if err != nil { if err != nil {
return s, nil, err return s, nil, fmt.Errorf("fse decompress returned: %w", err)
} }
if len(b) > 255 { if len(b) > 255 {
return s, nil, errors.New("corrupt input: output table too large") return s, nil, errors.New("corrupt input: output table too large")

View File

@ -4,360 +4,349 @@
// func decompress4x_main_loop_amd64(ctx *decompress4xContext) // func decompress4x_main_loop_amd64(ctx *decompress4xContext)
TEXT ·decompress4x_main_loop_amd64(SB), $0-8 TEXT ·decompress4x_main_loop_amd64(SB), $0-8
XORQ DX, DX
// Preload values // Preload values
MOVQ ctx+0(FP), AX MOVQ ctx+0(FP), AX
MOVBQZX 8(AX), DI MOVBQZX 8(AX), DI
MOVQ 16(AX), SI MOVQ 16(AX), BX
MOVQ 48(AX), BX MOVQ 48(AX), SI
MOVQ 24(AX), R9 MOVQ 24(AX), R8
MOVQ 32(AX), R10 MOVQ 32(AX), R9
MOVQ (AX), R11 MOVQ (AX), R10
// Main loop // Main loop
main_loop: main_loop:
MOVQ SI, R8 XORL DX, DX
CMPQ R8, BX CMPQ BX, SI
SETGE DL SETGE DL
// br0.fillFast32() // br0.fillFast32()
MOVQ 32(R11), R12 MOVQ 32(R10), R11
MOVBQZX 40(R11), R13 MOVBQZX 40(R10), R12
CMPQ R13, $0x20 CMPQ R12, $0x20
JBE skip_fill0 JBE skip_fill0
MOVQ 24(R11), AX MOVQ 24(R10), AX
SUBQ $0x20, R13 SUBQ $0x20, R12
SUBQ $0x04, AX SUBQ $0x04, AX
MOVQ (R11), R14 MOVQ (R10), R13
// b.value |= uint64(low) << (b.bitsRead & 63) // b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (AX)(R14*1), R14 MOVL (AX)(R13*1), R13
MOVQ R13, CX MOVQ R12, CX
SHLQ CL, R14 SHLQ CL, R13
MOVQ AX, 24(R11) MOVQ AX, 24(R10)
ORQ R14, R12 ORQ R13, R11
// exhausted = exhausted || (br0.off < 4) // exhausted += (br0.off < 4)
CMPQ AX, $0x04 CMPQ AX, $0x04
SETLT AL ADCB $+0, DL
ORB AL, DL
skip_fill0: skip_fill0:
// val0 := br0.peekTopBits(peekBits) // val0 := br0.peekTopBits(peekBits)
MOVQ R12, R14 MOVQ R11, R13
MOVQ DI, CX MOVQ DI, CX
SHRQ CL, R14 SHRQ CL, R13
// v0 := table[val0&mask] // v0 := table[val0&mask]
MOVW (R10)(R14*2), CX MOVW (R9)(R13*2), CX
// br0.advance(uint8(v0.entry)) // br0.advance(uint8(v0.entry))
MOVB CH, AL MOVB CH, AL
SHLQ CL, R12 SHLQ CL, R11
ADDB CL, R13 ADDB CL, R12
// val1 := br0.peekTopBits(peekBits) // val1 := br0.peekTopBits(peekBits)
MOVQ DI, CX MOVQ DI, CX
MOVQ R12, R14 MOVQ R11, R13
SHRQ CL, R14 SHRQ CL, R13
// v1 := table[val1&mask] // v1 := table[val1&mask]
MOVW (R10)(R14*2), CX MOVW (R9)(R13*2), CX
// br0.advance(uint8(v1.entry)) // br0.advance(uint8(v1.entry))
MOVB CH, AH MOVB CH, AH
SHLQ CL, R12 SHLQ CL, R11
ADDB CL, R13 ADDB CL, R12
// these two writes get coalesced // these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
MOVW AX, (R8) MOVW AX, (BX)
// update the bitreader structure // update the bitreader structure
MOVQ R12, 32(R11) MOVQ R11, 32(R10)
MOVB R13, 40(R11) MOVB R12, 40(R10)
ADDQ R9, R8
// br1.fillFast32() // br1.fillFast32()
MOVQ 80(R11), R12 MOVQ 80(R10), R11
MOVBQZX 88(R11), R13 MOVBQZX 88(R10), R12
CMPQ R13, $0x20 CMPQ R12, $0x20
JBE skip_fill1 JBE skip_fill1
MOVQ 72(R11), AX MOVQ 72(R10), AX
SUBQ $0x20, R13 SUBQ $0x20, R12
SUBQ $0x04, AX SUBQ $0x04, AX
MOVQ 48(R11), R14 MOVQ 48(R10), R13
// b.value |= uint64(low) << (b.bitsRead & 63) // b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (AX)(R14*1), R14 MOVL (AX)(R13*1), R13
MOVQ R13, CX MOVQ R12, CX
SHLQ CL, R14 SHLQ CL, R13
MOVQ AX, 72(R11) MOVQ AX, 72(R10)
ORQ R14, R12 ORQ R13, R11
// exhausted = exhausted || (br1.off < 4) // exhausted += (br1.off < 4)
CMPQ AX, $0x04 CMPQ AX, $0x04
SETLT AL ADCB $+0, DL
ORB AL, DL
skip_fill1: skip_fill1:
// val0 := br1.peekTopBits(peekBits) // val0 := br1.peekTopBits(peekBits)
MOVQ R12, R14 MOVQ R11, R13
MOVQ DI, CX MOVQ DI, CX
SHRQ CL, R14 SHRQ CL, R13
// v0 := table[val0&mask] // v0 := table[val0&mask]
MOVW (R10)(R14*2), CX MOVW (R9)(R13*2), CX
// br1.advance(uint8(v0.entry)) // br1.advance(uint8(v0.entry))
MOVB CH, AL MOVB CH, AL
SHLQ CL, R12 SHLQ CL, R11
ADDB CL, R13 ADDB CL, R12
// val1 := br1.peekTopBits(peekBits) // val1 := br1.peekTopBits(peekBits)
MOVQ DI, CX MOVQ DI, CX
MOVQ R12, R14 MOVQ R11, R13
SHRQ CL, R14 SHRQ CL, R13
// v1 := table[val1&mask] // v1 := table[val1&mask]
MOVW (R10)(R14*2), CX MOVW (R9)(R13*2), CX
// br1.advance(uint8(v1.entry)) // br1.advance(uint8(v1.entry))
MOVB CH, AH MOVB CH, AH
SHLQ CL, R12 SHLQ CL, R11
ADDB CL, R13 ADDB CL, R12
// these two writes get coalesced // these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
MOVW AX, (R8) MOVW AX, (BX)(R8*1)
// update the bitreader structure // update the bitreader structure
MOVQ R12, 80(R11) MOVQ R11, 80(R10)
MOVB R13, 88(R11) MOVB R12, 88(R10)
ADDQ R9, R8
// br2.fillFast32() // br2.fillFast32()
MOVQ 128(R11), R12 MOVQ 128(R10), R11
MOVBQZX 136(R11), R13 MOVBQZX 136(R10), R12
CMPQ R13, $0x20 CMPQ R12, $0x20
JBE skip_fill2 JBE skip_fill2
MOVQ 120(R11), AX MOVQ 120(R10), AX
SUBQ $0x20, R13 SUBQ $0x20, R12
SUBQ $0x04, AX SUBQ $0x04, AX
MOVQ 96(R11), R14 MOVQ 96(R10), R13
// b.value |= uint64(low) << (b.bitsRead & 63) // b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (AX)(R14*1), R14 MOVL (AX)(R13*1), R13
MOVQ R13, CX MOVQ R12, CX
SHLQ CL, R14 SHLQ CL, R13
MOVQ AX, 120(R11) MOVQ AX, 120(R10)
ORQ R14, R12 ORQ R13, R11
// exhausted = exhausted || (br2.off < 4) // exhausted += (br2.off < 4)
CMPQ AX, $0x04 CMPQ AX, $0x04
SETLT AL ADCB $+0, DL
ORB AL, DL
skip_fill2: skip_fill2:
// val0 := br2.peekTopBits(peekBits) // val0 := br2.peekTopBits(peekBits)
MOVQ R12, R14 MOVQ R11, R13
MOVQ DI, CX MOVQ DI, CX
SHRQ CL, R14 SHRQ CL, R13
// v0 := table[val0&mask] // v0 := table[val0&mask]
MOVW (R10)(R14*2), CX MOVW (R9)(R13*2), CX
// br2.advance(uint8(v0.entry)) // br2.advance(uint8(v0.entry))
MOVB CH, AL MOVB CH, AL
SHLQ CL, R12 SHLQ CL, R11
ADDB CL, R13 ADDB CL, R12
// val1 := br2.peekTopBits(peekBits) // val1 := br2.peekTopBits(peekBits)
MOVQ DI, CX MOVQ DI, CX
MOVQ R12, R14 MOVQ R11, R13
SHRQ CL, R14 SHRQ CL, R13
// v1 := table[val1&mask] // v1 := table[val1&mask]
MOVW (R10)(R14*2), CX MOVW (R9)(R13*2), CX
// br2.advance(uint8(v1.entry)) // br2.advance(uint8(v1.entry))
MOVB CH, AH MOVB CH, AH
SHLQ CL, R12 SHLQ CL, R11
ADDB CL, R13 ADDB CL, R12
// these two writes get coalesced // these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
MOVW AX, (R8) MOVW AX, (BX)(R8*2)
// update the bitreader structure // update the bitreader structure
MOVQ R12, 128(R11) MOVQ R11, 128(R10)
MOVB R13, 136(R11) MOVB R12, 136(R10)
ADDQ R9, R8
// br3.fillFast32() // br3.fillFast32()
MOVQ 176(R11), R12 MOVQ 176(R10), R11
MOVBQZX 184(R11), R13 MOVBQZX 184(R10), R12
CMPQ R13, $0x20 CMPQ R12, $0x20
JBE skip_fill3 JBE skip_fill3
MOVQ 168(R11), AX MOVQ 168(R10), AX
SUBQ $0x20, R13 SUBQ $0x20, R12
SUBQ $0x04, AX SUBQ $0x04, AX
MOVQ 144(R11), R14 MOVQ 144(R10), R13
// b.value |= uint64(low) << (b.bitsRead & 63) // b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (AX)(R14*1), R14 MOVL (AX)(R13*1), R13
MOVQ R13, CX MOVQ R12, CX
SHLQ CL, R14 SHLQ CL, R13
MOVQ AX, 168(R11) MOVQ AX, 168(R10)
ORQ R14, R12 ORQ R13, R11
// exhausted = exhausted || (br3.off < 4) // exhausted += (br3.off < 4)
CMPQ AX, $0x04 CMPQ AX, $0x04
SETLT AL ADCB $+0, DL
ORB AL, DL
skip_fill3: skip_fill3:
// val0 := br3.peekTopBits(peekBits) // val0 := br3.peekTopBits(peekBits)
MOVQ R12, R14 MOVQ R11, R13
MOVQ DI, CX MOVQ DI, CX
SHRQ CL, R14 SHRQ CL, R13
// v0 := table[val0&mask] // v0 := table[val0&mask]
MOVW (R10)(R14*2), CX MOVW (R9)(R13*2), CX
// br3.advance(uint8(v0.entry) // br3.advance(uint8(v0.entry)
MOVB CH, AL MOVB CH, AL
SHLQ CL, R12 SHLQ CL, R11
ADDB CL, R13 ADDB CL, R12
// val1 := br3.peekTopBits(peekBits) // val1 := br3.peekTopBits(peekBits)
MOVQ DI, CX MOVQ DI, CX
MOVQ R12, R14 MOVQ R11, R13
SHRQ CL, R14 SHRQ CL, R13
// v1 := table[val1&mask] // v1 := table[val1&mask]
MOVW (R10)(R14*2), CX MOVW (R9)(R13*2), CX
// br3.advance(uint8(v1.entry)) // br3.advance(uint8(v1.entry))
MOVB CH, AH MOVB CH, AH
SHLQ CL, R12 SHLQ CL, R11
ADDB CL, R13 ADDB CL, R12
// these two writes get coalesced // these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
MOVW AX, (R8) LEAQ (R8)(R8*2), CX
MOVW AX, (BX)(CX*1)
// update the bitreader structure // update the bitreader structure
MOVQ R12, 176(R11) MOVQ R11, 176(R10)
MOVB R13, 184(R11) MOVB R12, 184(R10)
ADDQ $0x02, SI ADDQ $0x02, BX
TESTB DL, DL TESTB DL, DL
JZ main_loop JZ main_loop
MOVQ ctx+0(FP), AX MOVQ ctx+0(FP), AX
SUBQ 16(AX), SI SUBQ 16(AX), BX
SHLQ $0x02, SI SHLQ $0x02, BX
MOVQ SI, 40(AX) MOVQ BX, 40(AX)
RET RET
// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext) // func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8 TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8
XORQ DX, DX
// Preload values // Preload values
MOVQ ctx+0(FP), CX MOVQ ctx+0(FP), CX
MOVBQZX 8(CX), DI MOVBQZX 8(CX), DI
MOVQ 16(CX), BX MOVQ 16(CX), BX
MOVQ 48(CX), SI MOVQ 48(CX), SI
MOVQ 24(CX), R9 MOVQ 24(CX), R8
MOVQ 32(CX), R10 MOVQ 32(CX), R9
MOVQ (CX), R11 MOVQ (CX), R10
// Main loop // Main loop
main_loop: main_loop:
MOVQ BX, R8 XORL DX, DX
CMPQ R8, SI CMPQ BX, SI
SETGE DL SETGE DL
// br0.fillFast32() // br0.fillFast32()
MOVQ 32(R11), R12 MOVQ 32(R10), R11
MOVBQZX 40(R11), R13 MOVBQZX 40(R10), R12
CMPQ R13, $0x20 CMPQ R12, $0x20
JBE skip_fill0 JBE skip_fill0
MOVQ 24(R11), R14 MOVQ 24(R10), R13
SUBQ $0x20, R13 SUBQ $0x20, R12
SUBQ $0x04, R14 SUBQ $0x04, R13
MOVQ (R11), R15 MOVQ (R10), R14
// b.value |= uint64(low) << (b.bitsRead & 63) // b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (R14)(R15*1), R15 MOVL (R13)(R14*1), R14
MOVQ R13, CX MOVQ R12, CX
SHLQ CL, R15 SHLQ CL, R14
MOVQ R14, 24(R11) MOVQ R13, 24(R10)
ORQ R15, R12 ORQ R14, R11
// exhausted = exhausted || (br0.off < 4) // exhausted += (br0.off < 4)
CMPQ R14, $0x04 CMPQ R13, $0x04
SETLT AL ADCB $+0, DL
ORB AL, DL
skip_fill0: skip_fill0:
// val0 := br0.peekTopBits(peekBits) // val0 := br0.peekTopBits(peekBits)
MOVQ R12, R14 MOVQ R11, R13
MOVQ DI, CX MOVQ DI, CX
SHRQ CL, R14 SHRQ CL, R13
// v0 := table[val0&mask] // v0 := table[val0&mask]
MOVW (R10)(R14*2), CX MOVW (R9)(R13*2), CX
// br0.advance(uint8(v0.entry) // br0.advance(uint8(v0.entry)
MOVB CH, AL MOVB CH, AL
SHLQ CL, R12 SHLQ CL, R11
ADDB CL, R13 ADDB CL, R12
// val1 := br0.peekTopBits(peekBits) // val1 := br0.peekTopBits(peekBits)
MOVQ R12, R14 MOVQ R11, R13
MOVQ DI, CX MOVQ DI, CX
SHRQ CL, R14 SHRQ CL, R13
// v1 := table[val0&mask] // v1 := table[val0&mask]
MOVW (R10)(R14*2), CX MOVW (R9)(R13*2), CX
// br0.advance(uint8(v1.entry) // br0.advance(uint8(v1.entry)
MOVB CH, AH MOVB CH, AH
SHLQ CL, R12 SHLQ CL, R11
ADDB CL, R13 ADDB CL, R12
BSWAPL AX BSWAPL AX
// val2 := br0.peekTopBits(peekBits) // val2 := br0.peekTopBits(peekBits)
MOVQ R12, R14 MOVQ R11, R13
MOVQ DI, CX MOVQ DI, CX
SHRQ CL, R14 SHRQ CL, R13
// v2 := table[val0&mask] // v2 := table[val0&mask]
MOVW (R10)(R14*2), CX MOVW (R9)(R13*2), CX
// br0.advance(uint8(v2.entry) // br0.advance(uint8(v2.entry)
MOVB CH, AH MOVB CH, AH
SHLQ CL, R12 SHLQ CL, R11
ADDB CL, R13 ADDB CL, R12
// val3 := br0.peekTopBits(peekBits) // val3 := br0.peekTopBits(peekBits)
MOVQ R12, R14 MOVQ R11, R13
MOVQ DI, CX MOVQ DI, CX
SHRQ CL, R14 SHRQ CL, R13
// v3 := table[val0&mask] // v3 := table[val0&mask]
MOVW (R10)(R14*2), CX MOVW (R9)(R13*2), CX
// br0.advance(uint8(v3.entry) // br0.advance(uint8(v3.entry)
MOVB CH, AL MOVB CH, AL
SHLQ CL, R12 SHLQ CL, R11
ADDB CL, R13 ADDB CL, R12
BSWAPL AX BSWAPL AX
// these four writes get coalesced // these four writes get coalesced
@ -365,88 +354,86 @@ skip_fill0:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8)
MOVL AX, (R8) MOVL AX, (BX)
// update the bitreader structure // update the bitreader structure
MOVQ R12, 32(R11) MOVQ R11, 32(R10)
MOVB R13, 40(R11) MOVB R12, 40(R10)
ADDQ R9, R8
// br1.fillFast32() // br1.fillFast32()
MOVQ 80(R11), R12 MOVQ 80(R10), R11
MOVBQZX 88(R11), R13 MOVBQZX 88(R10), R12
CMPQ R13, $0x20 CMPQ R12, $0x20
JBE skip_fill1 JBE skip_fill1
MOVQ 72(R11), R14 MOVQ 72(R10), R13
SUBQ $0x20, R13 SUBQ $0x20, R12
SUBQ $0x04, R14 SUBQ $0x04, R13
MOVQ 48(R11), R15 MOVQ 48(R10), R14
// b.value |= uint64(low) << (b.bitsRead & 63) // b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (R14)(R15*1), R15 MOVL (R13)(R14*1), R14
MOVQ R13, CX MOVQ R12, CX
SHLQ CL, R15 SHLQ CL, R14
MOVQ R14, 72(R11) MOVQ R13, 72(R10)
ORQ R15, R12 ORQ R14, R11
// exhausted = exhausted || (br1.off < 4) // exhausted += (br1.off < 4)
CMPQ R14, $0x04 CMPQ R13, $0x04
SETLT AL ADCB $+0, DL
ORB AL, DL
skip_fill1: skip_fill1:
// val0 := br1.peekTopBits(peekBits) // val0 := br1.peekTopBits(peekBits)
MOVQ R12, R14 MOVQ R11, R13
MOVQ DI, CX MOVQ DI, CX
SHRQ CL, R14 SHRQ CL, R13
// v0 := table[val0&mask] // v0 := table[val0&mask]
MOVW (R10)(R14*2), CX MOVW (R9)(R13*2), CX
// br1.advance(uint8(v0.entry) // br1.advance(uint8(v0.entry)
MOVB CH, AL MOVB CH, AL
SHLQ CL, R12 SHLQ CL, R11
ADDB CL, R13 ADDB CL, R12
// val1 := br1.peekTopBits(peekBits) // val1 := br1.peekTopBits(peekBits)
MOVQ R12, R14 MOVQ R11, R13
MOVQ DI, CX MOVQ DI, CX
SHRQ CL, R14 SHRQ CL, R13
// v1 := table[val0&mask] // v1 := table[val0&mask]
MOVW (R10)(R14*2), CX MOVW (R9)(R13*2), CX
// br1.advance(uint8(v1.entry) // br1.advance(uint8(v1.entry)
MOVB CH, AH MOVB CH, AH
SHLQ CL, R12 SHLQ CL, R11
ADDB CL, R13 ADDB CL, R12
BSWAPL AX BSWAPL AX
// val2 := br1.peekTopBits(peekBits) // val2 := br1.peekTopBits(peekBits)
MOVQ R12, R14 MOVQ R11, R13
MOVQ DI, CX MOVQ DI, CX
SHRQ CL, R14 SHRQ CL, R13
// v2 := table[val0&mask] // v2 := table[val0&mask]
MOVW (R10)(R14*2), CX MOVW (R9)(R13*2), CX
// br1.advance(uint8(v2.entry) // br1.advance(uint8(v2.entry)
MOVB CH, AH MOVB CH, AH
SHLQ CL, R12 SHLQ CL, R11
ADDB CL, R13 ADDB CL, R12
// val3 := br1.peekTopBits(peekBits) // val3 := br1.peekTopBits(peekBits)
MOVQ R12, R14 MOVQ R11, R13
MOVQ DI, CX MOVQ DI, CX
SHRQ CL, R14 SHRQ CL, R13
// v3 := table[val0&mask] // v3 := table[val0&mask]
MOVW (R10)(R14*2), CX MOVW (R9)(R13*2), CX
// br1.advance(uint8(v3.entry) // br1.advance(uint8(v3.entry)
MOVB CH, AL MOVB CH, AL
SHLQ CL, R12 SHLQ CL, R11
ADDB CL, R13 ADDB CL, R12
BSWAPL AX BSWAPL AX
// these four writes get coalesced // these four writes get coalesced
@ -454,88 +441,86 @@ skip_fill1:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8)
MOVL AX, (R8) MOVL AX, (BX)(R8*1)
// update the bitreader structure // update the bitreader structure
MOVQ R12, 80(R11) MOVQ R11, 80(R10)
MOVB R13, 88(R11) MOVB R12, 88(R10)
ADDQ R9, R8
// br2.fillFast32() // br2.fillFast32()
MOVQ 128(R11), R12 MOVQ 128(R10), R11
MOVBQZX 136(R11), R13 MOVBQZX 136(R10), R12
CMPQ R13, $0x20 CMPQ R12, $0x20
JBE skip_fill2 JBE skip_fill2
MOVQ 120(R11), R14 MOVQ 120(R10), R13
SUBQ $0x20, R13 SUBQ $0x20, R12
SUBQ $0x04, R14 SUBQ $0x04, R13
MOVQ 96(R11), R15 MOVQ 96(R10), R14
// b.value |= uint64(low) << (b.bitsRead & 63) // b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (R14)(R15*1), R15 MOVL (R13)(R14*1), R14
MOVQ R13, CX MOVQ R12, CX
SHLQ CL, R15 SHLQ CL, R14
MOVQ R14, 120(R11) MOVQ R13, 120(R10)
ORQ R15, R12 ORQ R14, R11
// exhausted = exhausted || (br2.off < 4) // exhausted += (br2.off < 4)
CMPQ R14, $0x04 CMPQ R13, $0x04
SETLT AL ADCB $+0, DL
ORB AL, DL
skip_fill2: skip_fill2:
// val0 := br2.peekTopBits(peekBits) // val0 := br2.peekTopBits(peekBits)
MOVQ R12, R14 MOVQ R11, R13
MOVQ DI, CX MOVQ DI, CX
SHRQ CL, R14 SHRQ CL, R13
// v0 := table[val0&mask] // v0 := table[val0&mask]
MOVW (R10)(R14*2), CX MOVW (R9)(R13*2), CX
// br2.advance(uint8(v0.entry) // br2.advance(uint8(v0.entry)
MOVB CH, AL MOVB CH, AL
SHLQ CL, R12 SHLQ CL, R11
ADDB CL, R13 ADDB CL, R12
// val1 := br2.peekTopBits(peekBits) // val1 := br2.peekTopBits(peekBits)
MOVQ R12, R14 MOVQ R11, R13
MOVQ DI, CX MOVQ DI, CX
SHRQ CL, R14 SHRQ CL, R13
// v1 := table[val0&mask] // v1 := table[val0&mask]
MOVW (R10)(R14*2), CX MOVW (R9)(R13*2), CX
// br2.advance(uint8(v1.entry) // br2.advance(uint8(v1.entry)
MOVB CH, AH MOVB CH, AH
SHLQ CL, R12 SHLQ CL, R11
ADDB CL, R13 ADDB CL, R12
BSWAPL AX BSWAPL AX
// val2 := br2.peekTopBits(peekBits) // val2 := br2.peekTopBits(peekBits)
MOVQ R12, R14 MOVQ R11, R13
MOVQ DI, CX MOVQ DI, CX
SHRQ CL, R14 SHRQ CL, R13
// v2 := table[val0&mask] // v2 := table[val0&mask]
MOVW (R10)(R14*2), CX MOVW (R9)(R13*2), CX
// br2.advance(uint8(v2.entry) // br2.advance(uint8(v2.entry)
MOVB CH, AH MOVB CH, AH
SHLQ CL, R12 SHLQ CL, R11
ADDB CL, R13 ADDB CL, R12
// val3 := br2.peekTopBits(peekBits) // val3 := br2.peekTopBits(peekBits)
MOVQ R12, R14 MOVQ R11, R13
MOVQ DI, CX MOVQ DI, CX
SHRQ CL, R14 SHRQ CL, R13
// v3 := table[val0&mask] // v3 := table[val0&mask]
MOVW (R10)(R14*2), CX MOVW (R9)(R13*2), CX
// br2.advance(uint8(v3.entry) // br2.advance(uint8(v3.entry)
MOVB CH, AL MOVB CH, AL
SHLQ CL, R12 SHLQ CL, R11
ADDB CL, R13 ADDB CL, R12
BSWAPL AX BSWAPL AX
// these four writes get coalesced // these four writes get coalesced
@ -543,88 +528,86 @@ skip_fill2:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8)
MOVL AX, (R8) MOVL AX, (BX)(R8*2)
// update the bitreader structure // update the bitreader structure
MOVQ R12, 128(R11) MOVQ R11, 128(R10)
MOVB R13, 136(R11) MOVB R12, 136(R10)
ADDQ R9, R8
// br3.fillFast32() // br3.fillFast32()
MOVQ 176(R11), R12 MOVQ 176(R10), R11
MOVBQZX 184(R11), R13 MOVBQZX 184(R10), R12
CMPQ R13, $0x20 CMPQ R12, $0x20
JBE skip_fill3 JBE skip_fill3
MOVQ 168(R11), R14 MOVQ 168(R10), R13
SUBQ $0x20, R13 SUBQ $0x20, R12
SUBQ $0x04, R14 SUBQ $0x04, R13
MOVQ 144(R11), R15 MOVQ 144(R10), R14
// b.value |= uint64(low) << (b.bitsRead & 63) // b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (R14)(R15*1), R15 MOVL (R13)(R14*1), R14
MOVQ R13, CX MOVQ R12, CX
SHLQ CL, R15 SHLQ CL, R14
MOVQ R14, 168(R11) MOVQ R13, 168(R10)
ORQ R15, R12 ORQ R14, R11
// exhausted = exhausted || (br3.off < 4) // exhausted += (br3.off < 4)
CMPQ R14, $0x04 CMPQ R13, $0x04
SETLT AL ADCB $+0, DL
ORB AL, DL
skip_fill3: skip_fill3:
// val0 := br3.peekTopBits(peekBits) // val0 := br3.peekTopBits(peekBits)
MOVQ R12, R14 MOVQ R11, R13
MOVQ DI, CX MOVQ DI, CX
SHRQ CL, R14 SHRQ CL, R13
// v0 := table[val0&mask] // v0 := table[val0&mask]
MOVW (R10)(R14*2), CX MOVW (R9)(R13*2), CX
// br3.advance(uint8(v0.entry) // br3.advance(uint8(v0.entry)
MOVB CH, AL MOVB CH, AL
SHLQ CL, R12 SHLQ CL, R11
ADDB CL, R13 ADDB CL, R12
// val1 := br3.peekTopBits(peekBits) // val1 := br3.peekTopBits(peekBits)
MOVQ R12, R14 MOVQ R11, R13
MOVQ DI, CX MOVQ DI, CX
SHRQ CL, R14 SHRQ CL, R13
// v1 := table[val0&mask] // v1 := table[val0&mask]
MOVW (R10)(R14*2), CX MOVW (R9)(R13*2), CX
// br3.advance(uint8(v1.entry) // br3.advance(uint8(v1.entry)
MOVB CH, AH MOVB CH, AH
SHLQ CL, R12 SHLQ CL, R11
ADDB CL, R13 ADDB CL, R12
BSWAPL AX BSWAPL AX
// val2 := br3.peekTopBits(peekBits) // val2 := br3.peekTopBits(peekBits)
MOVQ R12, R14 MOVQ R11, R13
MOVQ DI, CX MOVQ DI, CX
SHRQ CL, R14 SHRQ CL, R13
// v2 := table[val0&mask] // v2 := table[val0&mask]
MOVW (R10)(R14*2), CX MOVW (R9)(R13*2), CX
// br3.advance(uint8(v2.entry) // br3.advance(uint8(v2.entry)
MOVB CH, AH MOVB CH, AH
SHLQ CL, R12 SHLQ CL, R11
ADDB CL, R13 ADDB CL, R12
// val3 := br3.peekTopBits(peekBits) // val3 := br3.peekTopBits(peekBits)
MOVQ R12, R14 MOVQ R11, R13
MOVQ DI, CX MOVQ DI, CX
SHRQ CL, R14 SHRQ CL, R13
// v3 := table[val0&mask] // v3 := table[val0&mask]
MOVW (R10)(R14*2), CX MOVW (R9)(R13*2), CX
// br3.advance(uint8(v3.entry) // br3.advance(uint8(v3.entry)
MOVB CH, AL MOVB CH, AL
SHLQ CL, R12 SHLQ CL, R11
ADDB CL, R13 ADDB CL, R12
BSWAPL AX BSWAPL AX
// these four writes get coalesced // these four writes get coalesced
@ -632,11 +615,12 @@ skip_fill3:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8)
MOVL AX, (R8) LEAQ (R8)(R8*2), CX
MOVL AX, (BX)(CX*1)
// update the bitreader structure // update the bitreader structure
MOVQ R12, 176(R11) MOVQ R11, 176(R10)
MOVB R13, 184(R11) MOVB R12, 184(R10)
ADDQ $0x04, BX ADDQ $0x04, BX
TESTB DL, DL TESTB DL, DL
JZ main_loop JZ main_loop
@ -652,7 +636,7 @@ TEXT ·decompress1x_main_loop_amd64(SB), $0-8
MOVQ 16(CX), DX MOVQ 16(CX), DX
MOVQ 24(CX), BX MOVQ 24(CX), BX
CMPQ BX, $0x04 CMPQ BX, $0x04
JB error_max_decoded_size_exeeded JB error_max_decoded_size_exceeded
LEAQ (DX)(BX*1), BX LEAQ (DX)(BX*1), BX
MOVQ (CX), SI MOVQ (CX), SI
MOVQ (SI), R8 MOVQ (SI), R8
@ -667,7 +651,7 @@ main_loop:
// Check if we have room for 4 bytes in the output buffer // Check if we have room for 4 bytes in the output buffer
LEAQ 4(DX), CX LEAQ 4(DX), CX
CMPQ CX, BX CMPQ CX, BX
JGE error_max_decoded_size_exeeded JGE error_max_decoded_size_exceeded
// Decode 4 values // Decode 4 values
CMPQ R11, $0x20 CMPQ R11, $0x20
@ -744,7 +728,7 @@ loop_condition:
RET RET
// Report error // Report error
error_max_decoded_size_exeeded: error_max_decoded_size_exceeded:
MOVQ ctx+0(FP), AX MOVQ ctx+0(FP), AX
MOVQ $-1, CX MOVQ $-1, CX
MOVQ CX, 40(AX) MOVQ CX, 40(AX)
@ -757,7 +741,7 @@ TEXT ·decompress1x_main_loop_bmi2(SB), $0-8
MOVQ 16(CX), DX MOVQ 16(CX), DX
MOVQ 24(CX), BX MOVQ 24(CX), BX
CMPQ BX, $0x04 CMPQ BX, $0x04
JB error_max_decoded_size_exeeded JB error_max_decoded_size_exceeded
LEAQ (DX)(BX*1), BX LEAQ (DX)(BX*1), BX
MOVQ (CX), SI MOVQ (CX), SI
MOVQ (SI), R8 MOVQ (SI), R8
@ -772,7 +756,7 @@ main_loop:
// Check if we have room for 4 bytes in the output buffer // Check if we have room for 4 bytes in the output buffer
LEAQ 4(DX), CX LEAQ 4(DX), CX
CMPQ CX, BX CMPQ CX, BX
JGE error_max_decoded_size_exeeded JGE error_max_decoded_size_exceeded
// Decode 4 values // Decode 4 values
CMPQ R11, $0x20 CMPQ R11, $0x20
@ -839,7 +823,7 @@ loop_condition:
RET RET
// Report error // Report error
error_max_decoded_size_exeeded: error_max_decoded_size_exceeded:
MOVQ ctx+0(FP), AX MOVQ ctx+0(FP), AX
MOVQ $-1, CX MOVQ $-1, CX
MOVQ CX, 40(AX) MOVQ CX, 40(AX)

View File

@ -103,6 +103,28 @@ func hash(u, shift uint32) uint32 {
return (u * 0x1e35a7bd) >> shift return (u * 0x1e35a7bd) >> shift
} }
// EncodeBlockInto is an exported wrapper around encodeBlock that first
// verifies dst is big enough.
//
// If len(dst) is smaller than MaxEncodedLen(len(src)) nothing is written and
// 0 is returned. Otherwise src is encoded into dst and the number of bytes
// written is returned.
func EncodeBlockInto(dst, src []byte) (d int) {
	if len(dst) < MaxEncodedLen(len(src)) {
		return 0
	}

	// encodeBlock breaks on too big blocks, so feed it at most
	// maxBlockSize bytes at a time.
	for len(src) > 0 {
		chunk := src
		if len(chunk) > maxBlockSize {
			chunk, src = src[:maxBlockSize], src[maxBlockSize:]
		} else {
			src = nil
		}

		// Chunks below minNonLiteralBlockSize are emitted as a plain literal.
		if len(chunk) >= minNonLiteralBlockSize {
			d += encodeBlock(dst[d:], chunk)
		} else {
			d += emitLiteral(dst[d:], chunk)
		}
	}
	return d
}
// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It // encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already // assumes that the varint-encoded length of the decompressed bytes has already
// been written. // been written.

View File

@ -9,6 +9,7 @@ import (
"encoding/binary" "encoding/binary"
"errors" "errors"
"fmt" "fmt"
"hash/crc32"
"io" "io"
"os" "os"
"path/filepath" "path/filepath"
@ -82,8 +83,9 @@ type blockDec struct {
err error err error
// Check against this crc // Check against this crc, if hasCRC is true.
checkCRC []byte checkCRC uint32
hasCRC bool
// Frame to use for singlethreaded decoding. // Frame to use for singlethreaded decoding.
// Should not be used by the decoder itself since parent may be another frame. // Should not be used by the decoder itself since parent may be another frame.
@ -191,16 +193,14 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
} }
// Read block data. // Read block data.
if cap(b.dataStorage) < cSize { if _, ok := br.(*byteBuf); !ok && cap(b.dataStorage) < cSize {
// byteBuf doesn't need a destination buffer.
if b.lowMem || cSize > maxCompressedBlockSize { if b.lowMem || cSize > maxCompressedBlockSize {
b.dataStorage = make([]byte, 0, cSize+compressedBlockOverAlloc) b.dataStorage = make([]byte, 0, cSize+compressedBlockOverAlloc)
} else { } else {
b.dataStorage = make([]byte, 0, maxCompressedBlockSizeAlloc) b.dataStorage = make([]byte, 0, maxCompressedBlockSizeAlloc)
} }
} }
if cap(b.dst) <= maxSize {
b.dst = make([]byte, 0, maxSize+1)
}
b.data, err = br.readBig(cSize, b.dataStorage) b.data, err = br.readBig(cSize, b.dataStorage)
if err != nil { if err != nil {
if debugDecoder { if debugDecoder {
@ -209,6 +209,9 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
} }
return err return err
} }
if cap(b.dst) <= maxSize {
b.dst = make([]byte, 0, maxSize+1)
}
return nil return nil
} }
@ -440,6 +443,9 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
} }
} }
var err error var err error
if debugDecoder {
println("huff table input:", len(literals), "CRC:", crc32.ChecksumIEEE(literals))
}
huff, literals, err = huff0.ReadTable(literals, huff) huff, literals, err = huff0.ReadTable(literals, huff)
if err != nil { if err != nil {
println("reading huffman table:", err) println("reading huffman table:", err)

View File

@ -54,7 +54,7 @@ func (b *byteBuf) readBig(n int, dst []byte) ([]byte, error) {
func (b *byteBuf) readByte() (byte, error) { func (b *byteBuf) readByte() (byte, error) {
bb := *b bb := *b
if len(bb) < 1 { if len(bb) < 1 {
return 0, nil return 0, io.ErrUnexpectedEOF
} }
r := bb[0] r := bb[0]
*b = bb[1:] *b = bb[1:]

View File

@ -4,7 +4,6 @@
package zstd package zstd
import ( import (
"bytes"
"encoding/binary" "encoding/binary"
"errors" "errors"
"io" "io"
@ -102,8 +101,8 @@ func (h *Header) Decode(in []byte) error {
} }
h.HeaderSize += 4 h.HeaderSize += 4
b, in := in[:4], in[4:] b, in := in[:4], in[4:]
if !bytes.Equal(b, frameMagic) { if string(b) != frameMagic {
if !bytes.Equal(b[1:4], skippableFrameMagic) || b[0]&0xf0 != 0x50 { if string(b[1:4]) != skippableFrameMagic || b[0]&0xf0 != 0x50 {
return ErrMagicMismatch return ErrMagicMismatch
} }
if len(in) < 4 { if len(in) < 4 {
@ -153,7 +152,7 @@ func (h *Header) Decode(in []byte) error {
} }
b, in = in[:size], in[size:] b, in = in[:size], in[size:]
h.HeaderSize += int(size) h.HeaderSize += int(size)
switch size { switch len(b) {
case 1: case 1:
h.DictionaryID = uint32(b[0]) h.DictionaryID = uint32(b[0])
case 2: case 2:
@ -183,7 +182,7 @@ func (h *Header) Decode(in []byte) error {
} }
b, in = in[:fcsSize], in[fcsSize:] b, in = in[:fcsSize], in[fcsSize:]
h.HeaderSize += int(fcsSize) h.HeaderSize += int(fcsSize)
switch fcsSize { switch len(b) {
case 1: case 1:
h.FrameContentSize = uint64(b[0]) h.FrameContentSize = uint64(b[0])
case 2: case 2:

View File

@ -5,7 +5,6 @@
package zstd package zstd
import ( import (
"bytes"
"context" "context"
"encoding/binary" "encoding/binary"
"io" "io"
@ -41,8 +40,7 @@ type Decoder struct {
frame *frameDec frame *frameDec
// Custom dictionaries. // Custom dictionaries.
// Always uses copies. dicts map[uint32]*dict
dicts map[uint32]dict
// streamWg is the waitgroup for all streams // streamWg is the waitgroup for all streams
streamWg sync.WaitGroup streamWg sync.WaitGroup
@ -104,7 +102,7 @@ func NewReader(r io.Reader, opts ...DOption) (*Decoder, error) {
} }
// Transfer option dicts. // Transfer option dicts.
d.dicts = make(map[uint32]dict, len(d.o.dicts)) d.dicts = make(map[uint32]*dict, len(d.o.dicts))
for _, dc := range d.o.dicts { for _, dc := range d.o.dicts {
d.dicts[dc.id] = dc d.dicts[dc.id] = dc
} }
@ -342,15 +340,8 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
} }
return dst, err return dst, err
} }
if frame.DictionaryID != nil { if err = d.setDict(frame); err != nil {
dict, ok := d.dicts[*frame.DictionaryID] return nil, err
if !ok {
return nil, ErrUnknownDictionary
}
if debugDecoder {
println("setting dict", frame.DictionaryID)
}
frame.history.setDict(&dict)
} }
if frame.WindowSize > d.o.maxWindowSize { if frame.WindowSize > d.o.maxWindowSize {
if debugDecoder { if debugDecoder {
@ -459,7 +450,11 @@ func (d *Decoder) nextBlock(blocking bool) (ok bool) {
println("got", len(d.current.b), "bytes, error:", d.current.err, "data crc:", tmp) println("got", len(d.current.b), "bytes, error:", d.current.err, "data crc:", tmp)
} }
if !d.o.ignoreChecksum && len(next.b) > 0 { if d.o.ignoreChecksum {
return true
}
if len(next.b) > 0 {
n, err := d.current.crc.Write(next.b) n, err := d.current.crc.Write(next.b)
if err == nil { if err == nil {
if n != len(next.b) { if n != len(next.b) {
@ -467,18 +462,16 @@ func (d *Decoder) nextBlock(blocking bool) (ok bool) {
} }
} }
} }
if next.err == nil && next.d != nil && len(next.d.checkCRC) != 0 { if next.err == nil && next.d != nil && next.d.hasCRC {
got := d.current.crc.Sum64() got := uint32(d.current.crc.Sum64())
var tmp [4]byte if got != next.d.checkCRC {
binary.LittleEndian.PutUint32(tmp[:], uint32(got))
if !d.o.ignoreChecksum && !bytes.Equal(tmp[:], next.d.checkCRC) {
if debugDecoder { if debugDecoder {
println("CRC Check Failed:", tmp[:], " (got) !=", next.d.checkCRC, "(on stream)") printf("CRC Check Failed: %08x (got) != %08x (on stream)\n", got, next.d.checkCRC)
} }
d.current.err = ErrCRCMismatch d.current.err = ErrCRCMismatch
} else { } else {
if debugDecoder { if debugDecoder {
println("CRC ok", tmp[:]) printf("CRC ok %08x\n", got)
} }
} }
} }
@ -494,18 +487,12 @@ func (d *Decoder) nextBlockSync() (ok bool) {
if !d.syncStream.inFrame { if !d.syncStream.inFrame {
d.frame.history.reset() d.frame.history.reset()
d.current.err = d.frame.reset(&d.syncStream.br) d.current.err = d.frame.reset(&d.syncStream.br)
if d.current.err == nil {
d.current.err = d.setDict(d.frame)
}
if d.current.err != nil { if d.current.err != nil {
return false return false
} }
if d.frame.DictionaryID != nil {
dict, ok := d.dicts[*d.frame.DictionaryID]
if !ok {
d.current.err = ErrUnknownDictionary
return false
} else {
d.frame.history.setDict(&dict)
}
}
if d.frame.WindowSize > d.o.maxDecodedSize || d.frame.WindowSize > d.o.maxWindowSize { if d.frame.WindowSize > d.o.maxDecodedSize || d.frame.WindowSize > d.o.maxWindowSize {
d.current.err = ErrDecoderSizeExceeded d.current.err = ErrDecoderSizeExceeded
return false return false
@ -864,13 +851,8 @@ decodeStream:
if debugDecoder && err != nil { if debugDecoder && err != nil {
println("Frame decoder returned", err) println("Frame decoder returned", err)
} }
if err == nil && frame.DictionaryID != nil { if err == nil {
dict, ok := d.dicts[*frame.DictionaryID] err = d.setDict(frame)
if !ok {
err = ErrUnknownDictionary
} else {
frame.history.setDict(&dict)
}
} }
if err == nil && d.frame.WindowSize > d.o.maxWindowSize { if err == nil && d.frame.WindowSize > d.o.maxWindowSize {
if debugDecoder { if debugDecoder {
@ -918,18 +900,22 @@ decodeStream:
println("next block returned error:", err) println("next block returned error:", err)
} }
dec.err = err dec.err = err
dec.checkCRC = nil dec.hasCRC = false
if dec.Last && frame.HasCheckSum && err == nil { if dec.Last && frame.HasCheckSum && err == nil {
crc, err := frame.rawInput.readSmall(4) crc, err := frame.rawInput.readSmall(4)
if err != nil { if len(crc) < 4 {
if err == nil {
err = io.ErrUnexpectedEOF
}
println("CRC missing?", err) println("CRC missing?", err)
dec.err = err dec.err = err
} } else {
var tmp [4]byte dec.checkCRC = binary.LittleEndian.Uint32(crc)
copy(tmp[:], crc) dec.hasCRC = true
dec.checkCRC = tmp[:] if debugDecoder {
if debugDecoder { printf("found crc to check: %08x\n", dec.checkCRC)
println("found crc to check:", dec.checkCRC) }
} }
} }
err = dec.err err = dec.err
@ -948,3 +934,20 @@ decodeStream:
hist.reset() hist.reset()
d.frame.history.b = frameHistCache d.frame.history.b = frameHistCache
} }
// setDict looks up the dictionary registered under frame.DictionaryID and,
// when one is found, installs it in the frame's history.
//
// It returns ErrUnknownDictionary when no dictionary is registered for a
// non-zero id, and nil otherwise.
func (d *Decoder) setDict(frame *frameDec) (err error) {
	if dict, found := d.dicts[frame.DictionaryID]; found {
		if debugDecoder {
			println("setting dict", frame.DictionaryID)
		}
		frame.history.setDict(dict)
		return nil
	}

	// A zero or missing dictionary id is ambiguous:
	// either dictionary zero, or no dictionary. In particular,
	// zstd --patch-from uses this id for the source file,
	// so only return an error if the dictionary id is not zero.
	if frame.DictionaryID == 0 {
		return nil
	}
	return ErrUnknownDictionary
}

View File

@ -6,6 +6,8 @@ package zstd
import ( import (
"errors" "errors"
"fmt"
"math/bits"
"runtime" "runtime"
) )
@ -18,7 +20,7 @@ type decoderOptions struct {
concurrent int concurrent int
maxDecodedSize uint64 maxDecodedSize uint64
maxWindowSize uint64 maxWindowSize uint64
dicts []dict dicts []*dict
ignoreChecksum bool ignoreChecksum bool
limitToCap bool limitToCap bool
decodeBufsBelow int decodeBufsBelow int
@ -85,7 +87,13 @@ func WithDecoderMaxMemory(n uint64) DOption {
} }
// WithDecoderDicts allows to register one or more dictionaries for the decoder. // WithDecoderDicts allows to register one or more dictionaries for the decoder.
// If several dictionaries with the same ID is provided the last one will be used. //
// Each slice in dict must be in the [dictionary format] produced by
// "zstd --train" from the Zstandard reference implementation.
//
// If several dictionaries with the same ID are provided, the last one will be used.
//
// [dictionary format]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
func WithDecoderDicts(dicts ...[]byte) DOption { func WithDecoderDicts(dicts ...[]byte) DOption {
return func(o *decoderOptions) error { return func(o *decoderOptions) error {
for _, b := range dicts { for _, b := range dicts {
@ -93,12 +101,24 @@ func WithDecoderDicts(dicts ...[]byte) DOption {
if err != nil { if err != nil {
return err return err
} }
o.dicts = append(o.dicts, *d) o.dicts = append(o.dicts, d)
} }
return nil return nil
} }
} }
// WithDecoderDictRaw registers a dictionary that may be used by the decoder.
// The slice content can be arbitrary data.
//
// The dictionary is registered under id with initial offsets {1, 4, 8}.
// content is retained by the option as-is; it is not copied.
//
// Applying the option fails if content is longer than dictMaxLength. That
// check only runs on platforms where uint is wider than 32 bits.
func WithDecoderDictRaw(id uint32, content []byte) DOption {
	return func(o *decoderOptions) error {
		if bits.UintSize > 32 && uint(len(content)) > dictMaxLength {
			return fmt.Errorf("dictionary of size %d > 2GiB too large", len(content))
		}
		o.dicts = append(o.dicts, &dict{id: id, content: content, offsets: [3]int{1, 4, 8}})
		return nil
	}
}
// WithDecoderMaxWindow allows to set a maximum window size for decodes. // WithDecoderMaxWindow allows to set a maximum window size for decodes.
// This allows rejecting packets that will cause big memory usage. // This allows rejecting packets that will cause big memory usage.
// The Decoder will likely allocate more memory based on the WithDecoderLowmem setting. // The Decoder will likely allocate more memory based on the WithDecoderLowmem setting.

View File

@ -1,7 +1,6 @@
package zstd package zstd
import ( import (
"bytes"
"encoding/binary" "encoding/binary"
"errors" "errors"
"fmt" "fmt"
@ -20,7 +19,10 @@ type dict struct {
content []byte content []byte
} }
var dictMagic = [4]byte{0x37, 0xa4, 0x30, 0xec} const dictMagic = "\x37\xa4\x30\xec"
// Maximum dictionary size for the reference implementation (1.5.3) is 2 GiB.
const dictMaxLength = 1 << 31
// ID returns the dictionary id or 0 if d is nil. // ID returns the dictionary id or 0 if d is nil.
func (d *dict) ID() uint32 { func (d *dict) ID() uint32 {
@ -30,14 +32,38 @@ func (d *dict) ID() uint32 {
return d.id return d.id
} }
// DictContentSize returns the dictionary content size or 0 if d is nil. // ContentSize returns the dictionary content size or 0 if d is nil.
func (d *dict) DictContentSize() int { func (d *dict) ContentSize() int {
if d == nil { if d == nil {
return 0 return 0
} }
return len(d.content) return len(d.content)
} }
// Content returns the dictionary content, or nil if d is nil.
func (d *dict) Content() []byte {
	if d != nil {
		return d.content
	}
	return nil
}
// Offsets returns the initial offsets of the dictionary.
// A nil receiver yields an all-zero array.
func (d *dict) Offsets() [3]int {
	var offsets [3]int
	if d != nil {
		offsets = d.offsets
	}
	return offsets
}
// LitEncoder returns the literal encoder of the dictionary,
// or nil if d is nil.
func (d *dict) LitEncoder() *huff0.Scratch {
	if d != nil {
		return d.litEnc
	}
	return nil
}
// Load a dictionary as described in // Load a dictionary as described in
// https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format // https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format
func loadDict(b []byte) (*dict, error) { func loadDict(b []byte) (*dict, error) {
@ -50,7 +76,7 @@ func loadDict(b []byte) (*dict, error) {
ofDec: sequenceDec{fse: &fseDecoder{}}, ofDec: sequenceDec{fse: &fseDecoder{}},
mlDec: sequenceDec{fse: &fseDecoder{}}, mlDec: sequenceDec{fse: &fseDecoder{}},
} }
if !bytes.Equal(b[:4], dictMagic[:]) { if string(b[:4]) != dictMagic {
return nil, ErrMagicMismatch return nil, ErrMagicMismatch
} }
d.id = binary.LittleEndian.Uint32(b[4:8]) d.id = binary.LittleEndian.Uint32(b[4:8])
@ -62,7 +88,7 @@ func loadDict(b []byte) (*dict, error) {
var err error var err error
d.litEnc, b, err = huff0.ReadTable(b[8:], nil) d.litEnc, b, err = huff0.ReadTable(b[8:], nil)
if err != nil { if err != nil {
return nil, err return nil, fmt.Errorf("loading literal table: %w", err)
} }
d.litEnc.Reuse = huff0.ReusePolicyMust d.litEnc.Reuse = huff0.ReusePolicyMust
@ -120,3 +146,16 @@ func loadDict(b []byte) (*dict, error) {
return &d, nil return &d, nil
} }
// InspectDictionary loads a zstd dictionary from b and returns a read-only
// view that exposes its ID, content, initial offsets and literal encoder.
//
// If loading fails the error is returned and the first result wraps a nil
// *dict; it therefore does not compare equal to nil, so callers should test
// the error rather than the returned value.
func InspectDictionary(b []byte) (interface {
	ID() uint32
	ContentSize() int
	Content() []byte
	Offsets() [3]int
	LitEncoder() *huff0.Scratch
}, error) {
	// The predefined tables must be initialised before a dictionary is parsed.
	initPredefined()
	return loadDict(b)
}

View File

@ -16,6 +16,7 @@ type fastBase struct {
cur int32 cur int32
// maximum offset. Should be at least 2x block size. // maximum offset. Should be at least 2x block size.
maxMatchOff int32 maxMatchOff int32
bufferReset int32
hist []byte hist []byte
crc *xxhash.Digest crc *xxhash.Digest
tmp [8]byte tmp [8]byte
@ -56,8 +57,8 @@ func (e *fastBase) Block() *blockEnc {
} }
func (e *fastBase) addBlock(src []byte) int32 { func (e *fastBase) addBlock(src []byte) int32 {
if debugAsserts && e.cur > bufferReset { if debugAsserts && e.cur > e.bufferReset {
panic(fmt.Sprintf("ecur (%d) > buffer reset (%d)", e.cur, bufferReset)) panic(fmt.Sprintf("ecur (%d) > buffer reset (%d)", e.cur, e.bufferReset))
} }
// check if we have space already // check if we have space already
if len(e.hist)+len(src) > cap(e.hist) { if len(e.hist)+len(src) > cap(e.hist) {
@ -126,24 +127,7 @@ func (e *fastBase) matchlen(s, t int32, src []byte) int32 {
panic(fmt.Sprintf("len(src)-s (%d) > maxCompressedBlockSize (%d)", len(src)-int(s), maxCompressedBlockSize)) panic(fmt.Sprintf("len(src)-s (%d) > maxCompressedBlockSize (%d)", len(src)-int(s), maxCompressedBlockSize))
} }
} }
a := src[s:] return int32(matchLen(src[s:], src[t:]))
b := src[t:]
b = b[:len(a)]
end := int32((len(a) >> 3) << 3)
for i := int32(0); i < end; i += 8 {
if diff := load6432(a, i) ^ load6432(b, i); diff != 0 {
return i + int32(bits.TrailingZeros64(diff)>>3)
}
}
a = a[end:]
b = b[end:]
for i := range a {
if a[i] != b[i] {
return int32(i) + end
}
}
return int32(len(a)) + end
} }
// Reset the encoding table. // Reset the encoding table.
@ -165,13 +149,13 @@ func (e *fastBase) resetBase(d *dict, singleBlock bool) {
if singleBlock { if singleBlock {
e.lowMem = true e.lowMem = true
} }
e.ensureHist(d.DictContentSize() + maxCompressedBlockSize) e.ensureHist(d.ContentSize() + maxCompressedBlockSize)
e.lowMem = low e.lowMem = low
} }
// We offset current position so everything will be out of reach. // We offset current position so everything will be out of reach.
// If above reset line, history will be purged. // If above reset line, history will be purged.
if e.cur < bufferReset { if e.cur < e.bufferReset {
e.cur += e.maxMatchOff + int32(len(e.hist)) e.cur += e.maxMatchOff + int32(len(e.hist))
} }
e.hist = e.hist[:0] e.hist = e.hist[:0]

View File

@ -32,7 +32,6 @@ type match struct {
length int32 length int32
rep int32 rep int32
est int32 est int32
_ [12]byte // Aligned size to cache line: 4+4+4+4+4 bytes + 12 bytes padding = 32 bytes
} }
const highScore = 25000 const highScore = 25000
@ -85,14 +84,10 @@ func (e *bestFastEncoder) Encode(blk *blockEnc, src []byte) {
) )
// Protect against e.cur wraparound. // Protect against e.cur wraparound.
for e.cur >= bufferReset { for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 { if len(e.hist) == 0 {
for i := range e.table[:] { e.table = [bestShortTableSize]prevEntry{}
e.table[i] = prevEntry{} e.longTable = [bestLongTableSize]prevEntry{}
}
for i := range e.longTable[:] {
e.longTable[i] = prevEntry{}
}
e.cur = e.maxMatchOff e.cur = e.maxMatchOff
break break
} }
@ -193,12 +188,6 @@ encodeLoop:
panic("offset0 was 0") panic("offset0 was 0")
} }
bestOf := func(a, b match) match {
if a.est+(a.s-b.s)*bitsPerByte>>10 < b.est+(b.s-a.s)*bitsPerByte>>10 {
return a
}
return b
}
const goodEnough = 100 const goodEnough = 100
nextHashL := hashLen(cv, bestLongTableBits, bestLongLen) nextHashL := hashLen(cv, bestLongTableBits, bestLongLen)
@ -206,36 +195,41 @@ encodeLoop:
candidateL := e.longTable[nextHashL] candidateL := e.longTable[nextHashL]
candidateS := e.table[nextHashS] candidateS := e.table[nextHashS]
matchAt := func(offset int32, s int32, first uint32, rep int32) match { // Set m to a match at offset if it looks like that will improve compression.
improve := func(m *match, offset int32, s int32, first uint32, rep int32) {
if s-offset >= e.maxMatchOff || load3232(src, offset) != first { if s-offset >= e.maxMatchOff || load3232(src, offset) != first {
return match{s: s, est: highScore} return
} }
if debugAsserts { if debugAsserts {
if !bytes.Equal(src[s:s+4], src[offset:offset+4]) { if !bytes.Equal(src[s:s+4], src[offset:offset+4]) {
panic(fmt.Sprintf("first match mismatch: %v != %v, first: %08x", src[s:s+4], src[offset:offset+4], first)) panic(fmt.Sprintf("first match mismatch: %v != %v, first: %08x", src[s:s+4], src[offset:offset+4], first))
} }
} }
m := match{offset: offset, s: s, length: 4 + e.matchlen(s+4, offset+4, src), rep: rep} cand := match{offset: offset, s: s, length: 4 + e.matchlen(s+4, offset+4, src), rep: rep}
m.estBits(bitsPerByte) cand.estBits(bitsPerByte)
return m if m.est >= highScore || cand.est-m.est+(cand.s-m.s)*bitsPerByte>>10 < 0 {
*m = cand
}
} }
best := bestOf(matchAt(candidateL.offset-e.cur, s, uint32(cv), -1), matchAt(candidateL.prev-e.cur, s, uint32(cv), -1)) best := match{s: s, est: highScore}
best = bestOf(best, matchAt(candidateS.offset-e.cur, s, uint32(cv), -1)) improve(&best, candidateL.offset-e.cur, s, uint32(cv), -1)
best = bestOf(best, matchAt(candidateS.prev-e.cur, s, uint32(cv), -1)) improve(&best, candidateL.prev-e.cur, s, uint32(cv), -1)
improve(&best, candidateS.offset-e.cur, s, uint32(cv), -1)
improve(&best, candidateS.prev-e.cur, s, uint32(cv), -1)
if canRepeat && best.length < goodEnough { if canRepeat && best.length < goodEnough {
cv32 := uint32(cv >> 8) cv32 := uint32(cv >> 8)
spp := s + 1 spp := s + 1
best = bestOf(best, matchAt(spp-offset1, spp, cv32, 1)) improve(&best, spp-offset1, spp, cv32, 1)
best = bestOf(best, matchAt(spp-offset2, spp, cv32, 2)) improve(&best, spp-offset2, spp, cv32, 2)
best = bestOf(best, matchAt(spp-offset3, spp, cv32, 3)) improve(&best, spp-offset3, spp, cv32, 3)
if best.length > 0 { if best.length > 0 {
cv32 = uint32(cv >> 24) cv32 = uint32(cv >> 24)
spp += 2 spp += 2
best = bestOf(best, matchAt(spp-offset1, spp, cv32, 1)) improve(&best, spp-offset1, spp, cv32, 1)
best = bestOf(best, matchAt(spp-offset2, spp, cv32, 2)) improve(&best, spp-offset2, spp, cv32, 2)
best = bestOf(best, matchAt(spp-offset3, spp, cv32, 3)) improve(&best, spp-offset3, spp, cv32, 3)
} }
} }
// Load next and check... // Load next and check...
@ -262,28 +256,30 @@ encodeLoop:
candidateL2 := e.longTable[hashLen(cv2, bestLongTableBits, bestLongLen)] candidateL2 := e.longTable[hashLen(cv2, bestLongTableBits, bestLongLen)]
// Short at s+1 // Short at s+1
best = bestOf(best, matchAt(candidateS.offset-e.cur, s, uint32(cv), -1)) improve(&best, candidateS.offset-e.cur, s, uint32(cv), -1)
// Long at s+1, s+2 // Long at s+1, s+2
best = bestOf(best, matchAt(candidateL.offset-e.cur, s, uint32(cv), -1)) improve(&best, candidateL.offset-e.cur, s, uint32(cv), -1)
best = bestOf(best, matchAt(candidateL.prev-e.cur, s, uint32(cv), -1)) improve(&best, candidateL.prev-e.cur, s, uint32(cv), -1)
best = bestOf(best, matchAt(candidateL2.offset-e.cur, s+1, uint32(cv2), -1)) improve(&best, candidateL2.offset-e.cur, s+1, uint32(cv2), -1)
best = bestOf(best, matchAt(candidateL2.prev-e.cur, s+1, uint32(cv2), -1)) improve(&best, candidateL2.prev-e.cur, s+1, uint32(cv2), -1)
if false { if false {
// Short at s+3. // Short at s+3.
// Too often worse... // Too often worse...
best = bestOf(best, matchAt(e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+2, uint32(cv2>>8), -1)) improve(&best, e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+2, uint32(cv2>>8), -1)
} }
// See if we can find a better match by checking where the current best ends. // See if we can find a better match by checking where the current best ends.
// Use that offset to see if we can find a better full match. // Use that offset to see if we can find a better full match.
if sAt := best.s + best.length; sAt < sLimit { if sAt := best.s + best.length; sAt < sLimit {
nextHashL := hashLen(load6432(src, sAt), bestLongTableBits, bestLongLen) nextHashL := hashLen(load6432(src, sAt), bestLongTableBits, bestLongLen)
candidateEnd := e.longTable[nextHashL] candidateEnd := e.longTable[nextHashL]
if pos := candidateEnd.offset - e.cur - best.length; pos >= 0 { // Start check at a fixed offset to allow for a few mismatches.
bestEnd := bestOf(best, matchAt(pos, best.s, load3232(src, best.s), -1)) // For this compression level 2 yields the best results.
if pos := candidateEnd.prev - e.cur - best.length; pos >= 0 { const skipBeginning = 2
bestEnd = bestOf(bestEnd, matchAt(pos, best.s, load3232(src, best.s), -1)) if pos := candidateEnd.offset - e.cur - best.length + skipBeginning; pos >= 0 {
improve(&best, pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
if pos := candidateEnd.prev - e.cur - best.length + skipBeginning; pos >= 0 {
improve(&best, pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
} }
best = bestEnd
} }
} }
} }

View File

@ -62,14 +62,10 @@ func (e *betterFastEncoder) Encode(blk *blockEnc, src []byte) {
) )
// Protect against e.cur wraparound. // Protect against e.cur wraparound.
for e.cur >= bufferReset { for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 { if len(e.hist) == 0 {
for i := range e.table[:] { e.table = [betterShortTableSize]tableEntry{}
e.table[i] = tableEntry{} e.longTable = [betterLongTableSize]prevEntry{}
}
for i := range e.longTable[:] {
e.longTable[i] = prevEntry{}
}
e.cur = e.maxMatchOff e.cur = e.maxMatchOff
break break
} }
@ -587,7 +583,7 @@ func (e *betterFastEncoderDict) Encode(blk *blockEnc, src []byte) {
) )
// Protect against e.cur wraparound. // Protect against e.cur wraparound.
for e.cur >= bufferReset { for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 { if len(e.hist) == 0 {
for i := range e.table[:] { for i := range e.table[:] {
e.table[i] = tableEntry{} e.table[i] = tableEntry{}

View File

@ -44,14 +44,10 @@ func (e *doubleFastEncoder) Encode(blk *blockEnc, src []byte) {
) )
// Protect against e.cur wraparound. // Protect against e.cur wraparound.
for e.cur >= bufferReset { for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 { if len(e.hist) == 0 {
for i := range e.table[:] { e.table = [dFastShortTableSize]tableEntry{}
e.table[i] = tableEntry{} e.longTable = [dFastLongTableSize]tableEntry{}
}
for i := range e.longTable[:] {
e.longTable[i] = tableEntry{}
}
e.cur = e.maxMatchOff e.cur = e.maxMatchOff
break break
} }
@ -388,7 +384,7 @@ func (e *doubleFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
) )
// Protect against e.cur wraparound. // Protect against e.cur wraparound.
if e.cur >= bufferReset { if e.cur >= e.bufferReset {
for i := range e.table[:] { for i := range e.table[:] {
e.table[i] = tableEntry{} e.table[i] = tableEntry{}
} }
@ -685,7 +681,7 @@ encodeLoop:
} }
// We do not store history, so we must offset e.cur to avoid false matches for next user. // We do not store history, so we must offset e.cur to avoid false matches for next user.
if e.cur < bufferReset { if e.cur < e.bufferReset {
e.cur += int32(len(src)) e.cur += int32(len(src))
} }
} }
@ -700,7 +696,7 @@ func (e *doubleFastEncoderDict) Encode(blk *blockEnc, src []byte) {
) )
// Protect against e.cur wraparound. // Protect against e.cur wraparound.
for e.cur >= bufferReset { for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 { if len(e.hist) == 0 {
for i := range e.table[:] { for i := range e.table[:] {
e.table[i] = tableEntry{} e.table[i] = tableEntry{}

View File

@ -43,7 +43,7 @@ func (e *fastEncoder) Encode(blk *blockEnc, src []byte) {
) )
// Protect against e.cur wraparound. // Protect against e.cur wraparound.
for e.cur >= bufferReset { for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 { if len(e.hist) == 0 {
for i := range e.table[:] { for i := range e.table[:] {
e.table[i] = tableEntry{} e.table[i] = tableEntry{}
@ -310,7 +310,7 @@ func (e *fastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
} }
// Protect against e.cur wraparound. // Protect against e.cur wraparound.
if e.cur >= bufferReset { if e.cur >= e.bufferReset {
for i := range e.table[:] { for i := range e.table[:] {
e.table[i] = tableEntry{} e.table[i] = tableEntry{}
} }
@ -538,7 +538,7 @@ encodeLoop:
println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits) println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
} }
// We do not store history, so we must offset e.cur to avoid false matches for next user. // We do not store history, so we must offset e.cur to avoid false matches for next user.
if e.cur < bufferReset { if e.cur < e.bufferReset {
e.cur += int32(len(src)) e.cur += int32(len(src))
} }
} }
@ -555,11 +555,9 @@ func (e *fastEncoderDict) Encode(blk *blockEnc, src []byte) {
return return
} }
// Protect against e.cur wraparound. // Protect against e.cur wraparound.
for e.cur >= bufferReset { for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 { if len(e.hist) == 0 {
for i := range e.table[:] { e.table = [tableSize]tableEntry{}
e.table[i] = tableEntry{}
}
e.cur = e.maxMatchOff e.cur = e.maxMatchOff
break break
} }

View File

@ -8,6 +8,7 @@ import (
"crypto/rand" "crypto/rand"
"fmt" "fmt"
"io" "io"
"math"
rdebug "runtime/debug" rdebug "runtime/debug"
"sync" "sync"
@ -639,3 +640,37 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
} }
return dst return dst
} }
// MaxEncodedSize returns the expected maximum size of an encoded block or
// stream for an input of size bytes, given the encoder's current options
// (dictionary, CRC, block size and padding).
func (e *Encoder) MaxEncodedSize(size int) int {
	// Magic number (4 bytes) plus frame header and window descriptor (2 bytes).
	overhead := 4 + 2

	// A dictionary adds room for a dictionary ID.
	if e.o.dict != nil {
		overhead += 4
	}

	// Frame content size field: its width depends on the input size.
	switch {
	case size < 256:
		overhead++
	case size < 65536+256:
		overhead += 2
	case size < math.MaxInt32:
		overhead += 4
	default:
		overhead += 8
	}

	// Trailing checksum.
	if e.o.crc {
		overhead += 4
	}

	// Every block costs at most 3 bytes of header, and there is always at
	// least one block.
	nBlocks := (size + e.o.blockSize) / e.o.blockSize

	total := overhead + 3*nBlocks + size

	// Account for the skippable frame emitted to reach the padding multiple.
	if e.o.pad > 1 {
		total += calcSkippableFrame(int64(total), int64(e.o.pad))
	}
	return total
}

View File

@ -3,6 +3,8 @@ package zstd
import ( import (
"errors" "errors"
"fmt" "fmt"
"math"
"math/bits"
"runtime" "runtime"
"strings" "strings"
) )
@ -47,22 +49,22 @@ func (o encoderOptions) encoder() encoder {
switch o.level { switch o.level {
case SpeedFastest: case SpeedFastest:
if o.dict != nil { if o.dict != nil {
return &fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}} return &fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}
} }
return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}} return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}
case SpeedDefault: case SpeedDefault:
if o.dict != nil { if o.dict != nil {
return &doubleFastEncoderDict{fastEncoderDict: fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}} return &doubleFastEncoderDict{fastEncoderDict: fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}}
} }
return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}} return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}
case SpeedBetterCompression: case SpeedBetterCompression:
if o.dict != nil { if o.dict != nil {
return &betterFastEncoderDict{betterFastEncoder: betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}} return &betterFastEncoderDict{betterFastEncoder: betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}
} }
return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}} return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}
case SpeedBestCompression: case SpeedBestCompression:
return &bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}} return &bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}
} }
panic("unknown compression level") panic("unknown compression level")
} }
@ -304,7 +306,13 @@ func WithLowerEncoderMem(b bool) EOption {
} }
// WithEncoderDict allows to register a dictionary that will be used for the encode. // WithEncoderDict allows to register a dictionary that will be used for the encode.
//
// The slice dict must be in the [dictionary format] produced by
// "zstd --train" from the Zstandard reference implementation.
//
// The encoder *may* choose to use no dictionary instead for certain payloads. // The encoder *may* choose to use no dictionary instead for certain payloads.
//
// [dictionary format]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
func WithEncoderDict(dict []byte) EOption { func WithEncoderDict(dict []byte) EOption {
return func(o *encoderOptions) error { return func(o *encoderOptions) error {
d, err := loadDict(dict) d, err := loadDict(dict)
@ -315,3 +323,17 @@ func WithEncoderDict(dict []byte) EOption {
return nil return nil
} }
} }
// WithEncoderDictRaw registers a raw dictionary that the encoder may use.
//
// content is not parsed and may hold arbitrary bytes; it serves as the
// initial history. id is stored as the dictionary ID. The option returns an
// error if content is larger than dictMaxLength.
func WithEncoderDictRaw(id uint32, content []byte) EOption {
	return func(o *encoderOptions) error {
		// The size limit is only checked where uint is wider than 32 bits.
		tooLarge := bits.UintSize > 32 && uint(len(content)) > dictMaxLength
		if tooLarge {
			return fmt.Errorf("dictionary of size %d > 2GiB too large", len(content))
		}

		raw := &dict{
			id:      id,
			content: content,
			offsets: [3]int{1, 4, 8},
		}
		o.dict = raw
		return nil
	}
}

View File

@ -5,7 +5,7 @@
package zstd package zstd
import ( import (
"bytes" "encoding/binary"
"encoding/hex" "encoding/hex"
"errors" "errors"
"io" "io"
@ -29,7 +29,7 @@ type frameDec struct {
FrameContentSize uint64 FrameContentSize uint64
DictionaryID *uint32 DictionaryID uint32
HasCheckSum bool HasCheckSum bool
SingleSegment bool SingleSegment bool
} }
@ -43,9 +43,9 @@ const (
MaxWindowSize = 1 << 29 MaxWindowSize = 1 << 29
) )
var ( const (
frameMagic = []byte{0x28, 0xb5, 0x2f, 0xfd} frameMagic = "\x28\xb5\x2f\xfd"
skippableFrameMagic = []byte{0x2a, 0x4d, 0x18} skippableFrameMagic = "\x2a\x4d\x18"
) )
func newFrameDec(o decoderOptions) *frameDec { func newFrameDec(o decoderOptions) *frameDec {
@ -89,9 +89,9 @@ func (d *frameDec) reset(br byteBuffer) error {
copy(signature[1:], b) copy(signature[1:], b)
} }
if !bytes.Equal(signature[1:4], skippableFrameMagic) || signature[0]&0xf0 != 0x50 { if string(signature[1:4]) != skippableFrameMagic || signature[0]&0xf0 != 0x50 {
if debugDecoder { if debugDecoder {
println("Not skippable", hex.EncodeToString(signature[:]), hex.EncodeToString(skippableFrameMagic)) println("Not skippable", hex.EncodeToString(signature[:]), hex.EncodeToString([]byte(skippableFrameMagic)))
} }
// Break if not skippable frame. // Break if not skippable frame.
break break
@ -114,9 +114,9 @@ func (d *frameDec) reset(br byteBuffer) error {
return err return err
} }
} }
if !bytes.Equal(signature[:], frameMagic) { if string(signature[:]) != frameMagic {
if debugDecoder { if debugDecoder {
println("Got magic numbers: ", signature, "want:", frameMagic) println("Got magic numbers: ", signature, "want:", []byte(frameMagic))
} }
return ErrMagicMismatch return ErrMagicMismatch
} }
@ -155,7 +155,7 @@ func (d *frameDec) reset(br byteBuffer) error {
// Read Dictionary_ID // Read Dictionary_ID
// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary_id // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary_id
d.DictionaryID = nil d.DictionaryID = 0
if size := fhd & 3; size != 0 { if size := fhd & 3; size != 0 {
if size == 3 { if size == 3 {
size = 4 size = 4
@ -167,7 +167,7 @@ func (d *frameDec) reset(br byteBuffer) error {
return err return err
} }
var id uint32 var id uint32
switch size { switch len(b) {
case 1: case 1:
id = uint32(b[0]) id = uint32(b[0])
case 2: case 2:
@ -178,11 +178,7 @@ func (d *frameDec) reset(br byteBuffer) error {
if debugDecoder { if debugDecoder {
println("Dict size", size, "ID:", id) println("Dict size", size, "ID:", id)
} }
if id > 0 { d.DictionaryID = id
// ID 0 means "sorry, no dictionary anyway".
// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
d.DictionaryID = &id
}
} }
// Read Frame_Content_Size // Read Frame_Content_Size
@ -204,7 +200,7 @@ func (d *frameDec) reset(br byteBuffer) error {
println("Reading Frame content", err) println("Reading Frame content", err)
return err return err
} }
switch fcsSize { switch len(b) {
case 1: case 1:
d.FrameContentSize = uint64(b[0]) d.FrameContentSize = uint64(b[0])
case 2: case 2:
@ -305,7 +301,7 @@ func (d *frameDec) checkCRC() error {
} }
// We can overwrite upper tmp now // We can overwrite upper tmp now
want, err := d.rawInput.readSmall(4) buf, err := d.rawInput.readSmall(4)
if err != nil { if err != nil {
println("CRC missing?", err) println("CRC missing?", err)
return err return err
@ -315,22 +311,17 @@ func (d *frameDec) checkCRC() error {
return nil return nil
} }
var tmp [4]byte want := binary.LittleEndian.Uint32(buf[:4])
got := d.crc.Sum64() got := uint32(d.crc.Sum64())
// Flip to match file order.
tmp[0] = byte(got >> 0)
tmp[1] = byte(got >> 8)
tmp[2] = byte(got >> 16)
tmp[3] = byte(got >> 24)
if !bytes.Equal(tmp[:], want) { if got != want {
if debugDecoder { if debugDecoder {
println("CRC Check Failed:", tmp[:], "!=", want) printf("CRC check failed: got %08x, want %08x\n", got, want)
} }
return ErrCRCMismatch return ErrCRCMismatch
} }
if debugDecoder { if debugDecoder {
println("CRC ok", tmp[:]) printf("CRC ok %08x\n", got)
} }
return nil return nil
} }

View File

@ -2,12 +2,7 @@
VENDORED: Go to [github.com/cespare/xxhash](https://github.com/cespare/xxhash) for original package. VENDORED: Go to [github.com/cespare/xxhash](https://github.com/cespare/xxhash) for original package.
xxhash is a Go implementation of the 64-bit [xxHash] algorithm, XXH64. This is a
[![GoDoc](https://godoc.org/github.com/cespare/xxhash?status.svg)](https://godoc.org/github.com/cespare/xxhash)
[![Build Status](https://travis-ci.org/cespare/xxhash.svg?branch=master)](https://travis-ci.org/cespare/xxhash)
xxhash is a Go implementation of the 64-bit
[xxHash](http://cyan4973.github.io/xxHash/) algorithm, XXH64. This is a
high-quality hashing algorithm that is much faster than anything in the Go high-quality hashing algorithm that is much faster than anything in the Go
standard library. standard library.
@ -28,31 +23,49 @@ func (*Digest) WriteString(string) (int, error)
func (*Digest) Sum64() uint64 func (*Digest) Sum64() uint64
``` ```
This implementation provides a fast pure-Go implementation and an even faster The package is written with optimized pure Go and also contains even faster
assembly implementation for amd64. assembly implementations for amd64 and arm64. If desired, the `purego` build tag
opts into using the Go code even on those architectures.
[xxHash]: http://cyan4973.github.io/xxHash/
## Compatibility
This package is in a module and the latest code is in version 2 of the module.
You need a version of Go with at least "minimal module compatibility" to use
github.com/cespare/xxhash/v2:
* 1.9.7+ for Go 1.9
* 1.10.3+ for Go 1.10
* Go 1.11 or later
I recommend using the latest release of Go.
## Benchmarks ## Benchmarks
Here are some quick benchmarks comparing the pure-Go and assembly Here are some quick benchmarks comparing the pure-Go and assembly
implementations of Sum64. implementations of Sum64.
| input size | purego | asm | | input size | purego | asm |
| --- | --- | --- | | ---------- | --------- | --------- |
| 5 B | 979.66 MB/s | 1291.17 MB/s | | 4 B | 1.3 GB/s | 1.2 GB/s |
| 100 B | 7475.26 MB/s | 7973.40 MB/s | | 16 B | 2.9 GB/s | 3.5 GB/s |
| 4 KB | 17573.46 MB/s | 17602.65 MB/s | | 100 B | 6.9 GB/s | 8.1 GB/s |
| 10 MB | 17131.46 MB/s | 17142.16 MB/s | | 4 KB | 11.7 GB/s | 16.7 GB/s |
| 10 MB | 12.0 GB/s | 17.3 GB/s |
These numbers were generated on Ubuntu 18.04 with an Intel i7-8700K CPU using These numbers were generated on Ubuntu 20.04 with an Intel Xeon Platinum 8252C
the following commands under Go 1.11.2: CPU using the following commands under Go 1.19.2:
``` ```
$ go test -tags purego -benchtime 10s -bench '/xxhash,direct,bytes' benchstat <(go test -tags purego -benchtime 500ms -count 15 -bench 'Sum64$')
$ go test -benchtime 10s -bench '/xxhash,direct,bytes' benchstat <(go test -benchtime 500ms -count 15 -bench 'Sum64$')
``` ```
## Projects using this package ## Projects using this package
- [InfluxDB](https://github.com/influxdata/influxdb) - [InfluxDB](https://github.com/influxdata/influxdb)
- [Prometheus](https://github.com/prometheus/prometheus) - [Prometheus](https://github.com/prometheus/prometheus)
- [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
- [FreeCache](https://github.com/coocood/freecache) - [FreeCache](https://github.com/coocood/freecache)
- [FastCache](https://github.com/VictoriaMetrics/fastcache)

View File

@ -18,19 +18,11 @@ const (
prime5 uint64 = 2870177450012600261 prime5 uint64 = 2870177450012600261
) )
// NOTE(caleb): I'm using both consts and vars of the primes. Using consts where // Store the primes in an array as well.
// possible in the Go code is worth a small (but measurable) performance boost //
// by avoiding some MOVQs. Vars are needed for the asm and also are useful for // The consts are used when possible in Go code to avoid MOVs but we need a
// convenience in the Go code in a few places where we need to intentionally // contiguous array of the assembly code.
// avoid constant arithmetic (e.g., v1 := prime1 + prime2 fails because the var primes = [...]uint64{prime1, prime2, prime3, prime4, prime5}
// result overflows a uint64).
var (
prime1v = prime1
prime2v = prime2
prime3v = prime3
prime4v = prime4
prime5v = prime5
)
// Digest implements hash.Hash64. // Digest implements hash.Hash64.
type Digest struct { type Digest struct {
@ -52,10 +44,10 @@ func New() *Digest {
// Reset clears the Digest's state so that it can be reused. // Reset clears the Digest's state so that it can be reused.
func (d *Digest) Reset() { func (d *Digest) Reset() {
d.v1 = prime1v + prime2 d.v1 = primes[0] + prime2
d.v2 = prime2 d.v2 = prime2
d.v3 = 0 d.v3 = 0
d.v4 = -prime1v d.v4 = -primes[0]
d.total = 0 d.total = 0
d.n = 0 d.n = 0
} }
@ -71,21 +63,23 @@ func (d *Digest) Write(b []byte) (n int, err error) {
n = len(b) n = len(b)
d.total += uint64(n) d.total += uint64(n)
memleft := d.mem[d.n&(len(d.mem)-1):]
if d.n+n < 32 { if d.n+n < 32 {
// This new data doesn't even fill the current block. // This new data doesn't even fill the current block.
copy(d.mem[d.n:], b) copy(memleft, b)
d.n += n d.n += n
return return
} }
if d.n > 0 { if d.n > 0 {
// Finish off the partial block. // Finish off the partial block.
copy(d.mem[d.n:], b) c := copy(memleft, b)
d.v1 = round(d.v1, u64(d.mem[0:8])) d.v1 = round(d.v1, u64(d.mem[0:8]))
d.v2 = round(d.v2, u64(d.mem[8:16])) d.v2 = round(d.v2, u64(d.mem[8:16]))
d.v3 = round(d.v3, u64(d.mem[16:24])) d.v3 = round(d.v3, u64(d.mem[16:24]))
d.v4 = round(d.v4, u64(d.mem[24:32])) d.v4 = round(d.v4, u64(d.mem[24:32]))
b = b[32-d.n:] b = b[c:]
d.n = 0 d.n = 0
} }
@ -135,21 +129,20 @@ func (d *Digest) Sum64() uint64 {
h += d.total h += d.total
i, end := 0, d.n b := d.mem[:d.n&(len(d.mem)-1)]
for ; i+8 <= end; i += 8 { for ; len(b) >= 8; b = b[8:] {
k1 := round(0, u64(d.mem[i:i+8])) k1 := round(0, u64(b[:8]))
h ^= k1 h ^= k1
h = rol27(h)*prime1 + prime4 h = rol27(h)*prime1 + prime4
} }
if i+4 <= end { if len(b) >= 4 {
h ^= uint64(u32(d.mem[i:i+4])) * prime1 h ^= uint64(u32(b[:4])) * prime1
h = rol23(h)*prime2 + prime3 h = rol23(h)*prime2 + prime3
i += 4 b = b[4:]
} }
for i < end { for ; len(b) > 0; b = b[1:] {
h ^= uint64(d.mem[i]) * prime5 h ^= uint64(b[0]) * prime5
h = rol11(h) * prime1 h = rol11(h) * prime1
i++
} }
h ^= h >> 33 h ^= h >> 33

View File

@ -1,3 +1,4 @@
//go:build !appengine && gc && !purego && !noasm
// +build !appengine // +build !appengine
// +build gc // +build gc
// +build !purego // +build !purego
@ -5,212 +6,205 @@
#include "textflag.h" #include "textflag.h"
// Register allocation: // Registers:
// AX h #define h AX
// SI pointer to advance through b #define d AX
// DX n #define p SI // pointer to advance through b
// BX loop end #define n DX
// R8 v1, k1 #define end BX // loop end
// R9 v2 #define v1 R8
// R10 v3 #define v2 R9
// R11 v4 #define v3 R10
// R12 tmp #define v4 R11
// R13 prime1v #define x R12
// R14 prime2v #define prime1 R13
// DI prime4v #define prime2 R14
#define prime4 DI
// round reads from and advances the buffer pointer in SI. #define round(acc, x) \
// It assumes that R13 has prime1v and R14 has prime2v. IMULQ prime2, x \
#define round(r) \ ADDQ x, acc \
MOVQ (SI), R12 \ ROLQ $31, acc \
ADDQ $8, SI \ IMULQ prime1, acc
IMULQ R14, R12 \
ADDQ R12, r \
ROLQ $31, r \
IMULQ R13, r
// mergeRound applies a merge round on the two registers acc and val. // round0 performs the operation x = round(0, x).
// It assumes that R13 has prime1v, R14 has prime2v, and DI has prime4v. #define round0(x) \
#define mergeRound(acc, val) \ IMULQ prime2, x \
IMULQ R14, val \ ROLQ $31, x \
ROLQ $31, val \ IMULQ prime1, x
IMULQ R13, val \
XORQ val, acc \ // mergeRound applies a merge round on the two registers acc and x.
IMULQ R13, acc \ // It assumes that prime1, prime2, and prime4 have been loaded.
ADDQ DI, acc #define mergeRound(acc, x) \
round0(x) \
XORQ x, acc \
IMULQ prime1, acc \
ADDQ prime4, acc
// blockLoop processes as many 32-byte blocks as possible,
// updating v1, v2, v3, and v4. It assumes that there is at least one block
// to process.
//
// Each iteration loads the four 8-byte words at p+0, p+8, p+16 and p+24 into
// the scratch register x and folds them into v1, v2, v3 and v4 with round,
// then advances p by 32. The loop repeats while p <= end, so end is expected
// to hold the last address at which a full 32-byte block may start.
// round multiplies by prime1 and prime2, so both must already be loaded.
#define blockLoop() \
loop: \
MOVQ +0(p), x \
round(v1, x) \
MOVQ +8(p), x \
round(v2, x) \
MOVQ +16(p), x \
round(v3, x) \
MOVQ +24(p), x \
round(v4, x) \
ADDQ $32, p \
CMPQ p, end \
JLE loop
// func Sum64(b []byte) uint64 // func Sum64(b []byte) uint64
TEXT ·Sum64(SB), NOSPLIT, $0-32 TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
// Load fixed primes. // Load fixed primes.
MOVQ ·prime1v(SB), R13 MOVQ ·primes+0(SB), prime1
MOVQ ·prime2v(SB), R14 MOVQ ·primes+8(SB), prime2
MOVQ ·prime4v(SB), DI MOVQ ·primes+24(SB), prime4
// Load slice. // Load slice.
MOVQ b_base+0(FP), SI MOVQ b_base+0(FP), p
MOVQ b_len+8(FP), DX MOVQ b_len+8(FP), n
LEAQ (SI)(DX*1), BX LEAQ (p)(n*1), end
// The first loop limit will be len(b)-32. // The first loop limit will be len(b)-32.
SUBQ $32, BX SUBQ $32, end
// Check whether we have at least one block. // Check whether we have at least one block.
CMPQ DX, $32 CMPQ n, $32
JLT noBlocks JLT noBlocks
// Set up initial state (v1, v2, v3, v4). // Set up initial state (v1, v2, v3, v4).
MOVQ R13, R8 MOVQ prime1, v1
ADDQ R14, R8 ADDQ prime2, v1
MOVQ R14, R9 MOVQ prime2, v2
XORQ R10, R10 XORQ v3, v3
XORQ R11, R11 XORQ v4, v4
SUBQ R13, R11 SUBQ prime1, v4
// Loop until SI > BX. blockLoop()
blockLoop:
round(R8)
round(R9)
round(R10)
round(R11)
CMPQ SI, BX MOVQ v1, h
JLE blockLoop ROLQ $1, h
MOVQ v2, x
ROLQ $7, x
ADDQ x, h
MOVQ v3, x
ROLQ $12, x
ADDQ x, h
MOVQ v4, x
ROLQ $18, x
ADDQ x, h
MOVQ R8, AX mergeRound(h, v1)
ROLQ $1, AX mergeRound(h, v2)
MOVQ R9, R12 mergeRound(h, v3)
ROLQ $7, R12 mergeRound(h, v4)
ADDQ R12, AX
MOVQ R10, R12
ROLQ $12, R12
ADDQ R12, AX
MOVQ R11, R12
ROLQ $18, R12
ADDQ R12, AX
mergeRound(AX, R8)
mergeRound(AX, R9)
mergeRound(AX, R10)
mergeRound(AX, R11)
JMP afterBlocks JMP afterBlocks
noBlocks: noBlocks:
MOVQ ·prime5v(SB), AX MOVQ ·primes+32(SB), h
afterBlocks: afterBlocks:
ADDQ DX, AX ADDQ n, h
// Right now BX has len(b)-32, and we want to loop until SI > len(b)-8. ADDQ $24, end
ADDQ $24, BX CMPQ p, end
JG try4
CMPQ SI, BX loop8:
JG fourByte MOVQ (p), x
ADDQ $8, p
round0(x)
XORQ x, h
ROLQ $27, h
IMULQ prime1, h
ADDQ prime4, h
wordLoop: CMPQ p, end
// Calculate k1. JLE loop8
MOVQ (SI), R8
ADDQ $8, SI
IMULQ R14, R8
ROLQ $31, R8
IMULQ R13, R8
XORQ R8, AX try4:
ROLQ $27, AX ADDQ $4, end
IMULQ R13, AX CMPQ p, end
ADDQ DI, AX JG try1
CMPQ SI, BX MOVL (p), x
JLE wordLoop ADDQ $4, p
IMULQ prime1, x
XORQ x, h
fourByte: ROLQ $23, h
ADDQ $4, BX IMULQ prime2, h
CMPQ SI, BX ADDQ ·primes+16(SB), h
JG singles
MOVL (SI), R8 try1:
ADDQ $4, SI ADDQ $4, end
IMULQ R13, R8 CMPQ p, end
XORQ R8, AX
ROLQ $23, AX
IMULQ R14, AX
ADDQ ·prime3v(SB), AX
singles:
ADDQ $4, BX
CMPQ SI, BX
JGE finalize JGE finalize
singlesLoop: loop1:
MOVBQZX (SI), R12 MOVBQZX (p), x
ADDQ $1, SI ADDQ $1, p
IMULQ ·prime5v(SB), R12 IMULQ ·primes+32(SB), x
XORQ R12, AX XORQ x, h
ROLQ $11, h
IMULQ prime1, h
ROLQ $11, AX CMPQ p, end
IMULQ R13, AX JL loop1
CMPQ SI, BX
JL singlesLoop
finalize: finalize:
MOVQ AX, R12 MOVQ h, x
SHRQ $33, R12 SHRQ $33, x
XORQ R12, AX XORQ x, h
IMULQ R14, AX IMULQ prime2, h
MOVQ AX, R12 MOVQ h, x
SHRQ $29, R12 SHRQ $29, x
XORQ R12, AX XORQ x, h
IMULQ ·prime3v(SB), AX IMULQ ·primes+16(SB), h
MOVQ AX, R12 MOVQ h, x
SHRQ $32, R12 SHRQ $32, x
XORQ R12, AX XORQ x, h
MOVQ AX, ret+24(FP) MOVQ h, ret+24(FP)
RET RET
// writeBlocks uses the same registers as above except that it uses AX to store
// the d pointer.
// func writeBlocks(d *Digest, b []byte) int // func writeBlocks(d *Digest, b []byte) int
TEXT ·writeBlocks(SB), NOSPLIT, $0-40 TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
// Load fixed primes needed for round. // Load fixed primes needed for round.
MOVQ ·prime1v(SB), R13 MOVQ ·primes+0(SB), prime1
MOVQ ·prime2v(SB), R14 MOVQ ·primes+8(SB), prime2
// Load slice. // Load slice.
MOVQ b_base+8(FP), SI MOVQ b_base+8(FP), p
MOVQ b_len+16(FP), DX MOVQ b_len+16(FP), n
LEAQ (SI)(DX*1), BX LEAQ (p)(n*1), end
SUBQ $32, BX SUBQ $32, end
// Load vN from d. // Load vN from d.
MOVQ d+0(FP), AX MOVQ s+0(FP), d
MOVQ 0(AX), R8 // v1 MOVQ 0(d), v1
MOVQ 8(AX), R9 // v2 MOVQ 8(d), v2
MOVQ 16(AX), R10 // v3 MOVQ 16(d), v3
MOVQ 24(AX), R11 // v4 MOVQ 24(d), v4
// We don't need to check the loop condition here; this function is // We don't need to check the loop condition here; this function is
// always called with at least one block of data to process. // always called with at least one block of data to process.
blockLoop: blockLoop()
round(R8)
round(R9)
round(R10)
round(R11)
CMPQ SI, BX
JLE blockLoop
// Copy vN back to d. // Copy vN back to d.
MOVQ R8, 0(AX) MOVQ v1, 0(d)
MOVQ R9, 8(AX) MOVQ v2, 8(d)
MOVQ R10, 16(AX) MOVQ v3, 16(d)
MOVQ R11, 24(AX) MOVQ v4, 24(d)
// The number of bytes written is SI minus the old base pointer. // The number of bytes written is p minus the old base pointer.
SUBQ b_base+8(FP), SI SUBQ b_base+8(FP), p
MOVQ SI, ret+32(FP) MOVQ p, ret+32(FP)
RET RET

View File

@ -1,13 +1,17 @@
// +build gc,!purego,!noasm //go:build !appengine && gc && !purego && !noasm
// +build !appengine
// +build gc
// +build !purego
// +build !noasm
#include "textflag.h" #include "textflag.h"
// Register allocation. // Registers:
#define digest R1 #define digest R1
#define h R2 // Return value. #define h R2 // return value
#define p R3 // Input pointer. #define p R3 // input pointer
#define len R4 #define n R4 // input length
#define nblocks R5 // len / 32. #define nblocks R5 // n / 32
#define prime1 R7 #define prime1 R7
#define prime2 R8 #define prime2 R8
#define prime3 R9 #define prime3 R9
@ -25,60 +29,52 @@
#define round(acc, x) \ #define round(acc, x) \
MADD prime2, acc, x, acc \ MADD prime2, acc, x, acc \
ROR $64-31, acc \ ROR $64-31, acc \
MUL prime1, acc \ MUL prime1, acc
// x = round(0, x). // round0 performs the operation x = round(0, x).
#define round0(x) \ #define round0(x) \
MUL prime2, x \ MUL prime2, x \
ROR $64-31, x \ ROR $64-31, x \
MUL prime1, x \ MUL prime1, x
#define mergeRound(x) \ #define mergeRound(acc, x) \
round0(x) \ round0(x) \
EOR x, h \ EOR x, acc \
MADD h, prime4, prime1, h \ MADD acc, prime4, prime1, acc
// Update v[1-4] with 32-byte blocks. Assumes len >= 32. // blockLoop processes as many 32-byte blocks as possible,
#define blocksLoop() \ // updating v1, v2, v3, and v4. It assumes that n >= 32.
LSR $5, len, nblocks \ #define blockLoop() \
PCALIGN $16 \ LSR $5, n, nblocks \
loop: \ PCALIGN $16 \
LDP.P 32(p), (x1, x2) \ loop: \
round(v1, x1) \ LDP.P 16(p), (x1, x2) \
LDP -16(p), (x3, x4) \ LDP.P 16(p), (x3, x4) \
round(v2, x2) \ round(v1, x1) \
SUB $1, nblocks \ round(v2, x2) \
round(v3, x3) \ round(v3, x3) \
round(v4, x4) \ round(v4, x4) \
CBNZ nblocks, loop \ SUB $1, nblocks \
CBNZ nblocks, loop
// The primes are repeated here to ensure that they're stored
// in a contiguous array, so we can load them with LDP.
DATA primes<> +0(SB)/8, $11400714785074694791
DATA primes<> +8(SB)/8, $14029467366897019727
DATA primes<>+16(SB)/8, $1609587929392839161
DATA primes<>+24(SB)/8, $9650029242287828579
DATA primes<>+32(SB)/8, $2870177450012600261
GLOBL primes<>(SB), NOPTR+RODATA, $40
// func Sum64(b []byte) uint64 // func Sum64(b []byte) uint64
TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32 TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
LDP b_base+0(FP), (p, len) LDP b_base+0(FP), (p, n)
LDP primes<> +0(SB), (prime1, prime2) LDP ·primes+0(SB), (prime1, prime2)
LDP primes<>+16(SB), (prime3, prime4) LDP ·primes+16(SB), (prime3, prime4)
MOVD primes<>+32(SB), prime5 MOVD ·primes+32(SB), prime5
CMP $32, len CMP $32, n
CSEL LO, prime5, ZR, h // if len < 32 { h = prime5 } else { h = 0 } CSEL LT, prime5, ZR, h // if n < 32 { h = prime5 } else { h = 0 }
BLO afterLoop BLT afterLoop
ADD prime1, prime2, v1 ADD prime1, prime2, v1
MOVD prime2, v2 MOVD prime2, v2
MOVD $0, v3 MOVD $0, v3
NEG prime1, v4 NEG prime1, v4
blocksLoop() blockLoop()
ROR $64-1, v1, x1 ROR $64-1, v1, x1
ROR $64-7, v2, x2 ROR $64-7, v2, x2
@ -88,71 +84,75 @@ TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32
ADD x3, x4 ADD x3, x4
ADD x2, x4, h ADD x2, x4, h
mergeRound(v1) mergeRound(h, v1)
mergeRound(v2) mergeRound(h, v2)
mergeRound(v3) mergeRound(h, v3)
mergeRound(v4) mergeRound(h, v4)
afterLoop: afterLoop:
ADD len, h ADD n, h
TBZ $4, len, try8 TBZ $4, n, try8
LDP.P 16(p), (x1, x2) LDP.P 16(p), (x1, x2)
round0(x1) round0(x1)
// NOTE: here and below, sequencing the EOR after the ROR (using a
// rotated register) is worth a small but measurable speedup for small
// inputs.
ROR $64-27, h ROR $64-27, h
EOR x1 @> 64-27, h, h EOR x1 @> 64-27, h, h
MADD h, prime4, prime1, h MADD h, prime4, prime1, h
round0(x2) round0(x2)
ROR $64-27, h ROR $64-27, h
EOR x2 @> 64-27, h EOR x2 @> 64-27, h, h
MADD h, prime4, prime1, h MADD h, prime4, prime1, h
try8: try8:
TBZ $3, len, try4 TBZ $3, n, try4
MOVD.P 8(p), x1 MOVD.P 8(p), x1
round0(x1) round0(x1)
ROR $64-27, h ROR $64-27, h
EOR x1 @> 64-27, h EOR x1 @> 64-27, h, h
MADD h, prime4, prime1, h MADD h, prime4, prime1, h
try4: try4:
TBZ $2, len, try2 TBZ $2, n, try2
MOVWU.P 4(p), x2 MOVWU.P 4(p), x2
MUL prime1, x2 MUL prime1, x2
ROR $64-23, h ROR $64-23, h
EOR x2 @> 64-23, h EOR x2 @> 64-23, h, h
MADD h, prime3, prime2, h MADD h, prime3, prime2, h
try2: try2:
TBZ $1, len, try1 TBZ $1, n, try1
MOVHU.P 2(p), x3 MOVHU.P 2(p), x3
AND $255, x3, x1 AND $255, x3, x1
LSR $8, x3, x2 LSR $8, x3, x2
MUL prime5, x1 MUL prime5, x1
ROR $64-11, h ROR $64-11, h
EOR x1 @> 64-11, h EOR x1 @> 64-11, h, h
MUL prime1, h MUL prime1, h
MUL prime5, x2 MUL prime5, x2
ROR $64-11, h ROR $64-11, h
EOR x2 @> 64-11, h EOR x2 @> 64-11, h, h
MUL prime1, h MUL prime1, h
try1: try1:
TBZ $0, len, end TBZ $0, n, finalize
MOVBU (p), x4 MOVBU (p), x4
MUL prime5, x4 MUL prime5, x4
ROR $64-11, h ROR $64-11, h
EOR x4 @> 64-11, h EOR x4 @> 64-11, h, h
MUL prime1, h MUL prime1, h
end: finalize:
EOR h >> 33, h EOR h >> 33, h
MUL prime2, h MUL prime2, h
EOR h >> 29, h EOR h >> 29, h
@ -163,24 +163,22 @@ end:
RET RET
// func writeBlocks(d *Digest, b []byte) int // func writeBlocks(d *Digest, b []byte) int
// TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
// Assumes len(b) >= 32. LDP ·primes+0(SB), (prime1, prime2)
TEXT ·writeBlocks(SB), NOFRAME+NOSPLIT, $0-40
LDP primes<>(SB), (prime1, prime2)
// Load state. Assume v[1-4] are stored contiguously. // Load state. Assume v[1-4] are stored contiguously.
MOVD d+0(FP), digest MOVD d+0(FP), digest
LDP 0(digest), (v1, v2) LDP 0(digest), (v1, v2)
LDP 16(digest), (v3, v4) LDP 16(digest), (v3, v4)
LDP b_base+8(FP), (p, len) LDP b_base+8(FP), (p, n)
blocksLoop() blockLoop()
// Store updated state. // Store updated state.
STP (v1, v2), 0(digest) STP (v1, v2), 0(digest)
STP (v3, v4), 16(digest) STP (v3, v4), 16(digest)
BIC $31, len BIC $31, n
MOVD len, ret+32(FP) MOVD n, ret+32(FP)
RET RET

View File

@ -13,4 +13,4 @@ package xxhash
func Sum64(b []byte) uint64 func Sum64(b []byte) uint64
//go:noescape //go:noescape
func writeBlocks(d *Digest, b []byte) int func writeBlocks(s *Digest, b []byte) int

View File

@ -15,10 +15,10 @@ func Sum64(b []byte) uint64 {
var h uint64 var h uint64
if n >= 32 { if n >= 32 {
v1 := prime1v + prime2 v1 := primes[0] + prime2
v2 := prime2 v2 := prime2
v3 := uint64(0) v3 := uint64(0)
v4 := -prime1v v4 := -primes[0]
for len(b) >= 32 { for len(b) >= 32 {
v1 = round(v1, u64(b[0:8:len(b)])) v1 = round(v1, u64(b[0:8:len(b)]))
v2 = round(v2, u64(b[8:16:len(b)])) v2 = round(v2, u64(b[8:16:len(b)]))
@ -37,19 +37,18 @@ func Sum64(b []byte) uint64 {
h += uint64(n) h += uint64(n)
i, end := 0, len(b) for ; len(b) >= 8; b = b[8:] {
for ; i+8 <= end; i += 8 { k1 := round(0, u64(b[:8]))
k1 := round(0, u64(b[i:i+8:len(b)]))
h ^= k1 h ^= k1
h = rol27(h)*prime1 + prime4 h = rol27(h)*prime1 + prime4
} }
if i+4 <= end { if len(b) >= 4 {
h ^= uint64(u32(b[i:i+4:len(b)])) * prime1 h ^= uint64(u32(b[:4])) * prime1
h = rol23(h)*prime2 + prime3 h = rol23(h)*prime2 + prime3
i += 4 b = b[4:]
} }
for ; i < end; i++ { for ; len(b) > 0; b = b[1:] {
h ^= uint64(b[i]) * prime5 h ^= uint64(b[0]) * prime5
h = rol11(h) * prime1 h = rol11(h) * prime1
} }

View File

@ -314,9 +314,6 @@ func (s *sequenceDecs) decodeSync(hist []byte) error {
} }
size := ll + ml + len(out) size := ll + ml + len(out)
if size-startSize > maxBlockSize { if size-startSize > maxBlockSize {
if size-startSize == 424242 {
panic("here")
}
return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize) return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
} }
if size > cap(out) { if size > cap(out) {
@ -427,8 +424,7 @@ func (s *sequenceDecs) decodeSync(hist []byte) error {
} }
} }
// Check if space for literals if size := len(s.literals) + len(out) - startSize; size > maxBlockSize {
if size := len(s.literals) + len(s.out) - startSize; size > maxBlockSize {
return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize) return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
} }

View File

@ -148,7 +148,6 @@ func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
s.seqSize += ctx.litRemain s.seqSize += ctx.litRemain
if s.seqSize > maxBlockSize { if s.seqSize > maxBlockSize {
return true, fmt.Errorf("output bigger than max block size (%d)", maxBlockSize) return true, fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
} }
err := br.close() err := br.close()
if err != nil { if err != nil {

View File

@ -320,10 +320,6 @@ error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP) MOVQ $0x00000004, ret+24(FP)
RET RET
// Return with not enough output space error
MOVQ $0x00000005, ret+24(FP)
RET
// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int // func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV // Requires: CMOV
TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32 TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
@ -617,10 +613,6 @@ error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP) MOVQ $0x00000004, ret+24(FP)
RET RET
// Return with not enough output space error
MOVQ $0x00000005, ret+24(FP)
RET
// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int // func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV // Requires: BMI, BMI2, CMOV
TEXT ·sequenceDecs_decode_bmi2(SB), $8-32 TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
@ -897,10 +889,6 @@ error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP) MOVQ $0x00000004, ret+24(FP)
RET RET
// Return with not enough output space error
MOVQ $0x00000005, ret+24(FP)
RET
// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int // func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV // Requires: BMI, BMI2, CMOV
TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32 TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
@ -1152,10 +1140,6 @@ error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP) MOVQ $0x00000004, ret+24(FP)
RET RET
// Return with not enough output space error
MOVQ $0x00000005, ret+24(FP)
RET
// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool // func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
// Requires: SSE // Requires: SSE
TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9 TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
@ -1389,8 +1373,7 @@ loop_finished:
MOVQ ctx+0(FP), AX MOVQ ctx+0(FP), AX
MOVQ DX, 24(AX) MOVQ DX, 24(AX)
MOVQ DI, 104(AX) MOVQ DI, 104(AX)
MOVQ 80(AX), CX SUBQ 80(AX), SI
SUBQ CX, SI
MOVQ SI, 112(AX) MOVQ SI, 112(AX)
RET RET
@ -1402,8 +1385,7 @@ error_match_off_too_big:
MOVQ ctx+0(FP), AX MOVQ ctx+0(FP), AX
MOVQ DX, 24(AX) MOVQ DX, 24(AX)
MOVQ DI, 104(AX) MOVQ DI, 104(AX)
MOVQ 80(AX), CX SUBQ 80(AX), SI
SUBQ CX, SI
MOVQ SI, 112(AX) MOVQ SI, 112(AX)
RET RET
@ -1747,8 +1729,7 @@ loop_finished:
MOVQ ctx+0(FP), AX MOVQ ctx+0(FP), AX
MOVQ DX, 24(AX) MOVQ DX, 24(AX)
MOVQ DI, 104(AX) MOVQ DI, 104(AX)
MOVQ 80(AX), CX SUBQ 80(AX), SI
SUBQ CX, SI
MOVQ SI, 112(AX) MOVQ SI, 112(AX)
RET RET
@ -1760,8 +1741,7 @@ error_match_off_too_big:
MOVQ ctx+0(FP), AX MOVQ ctx+0(FP), AX
MOVQ DX, 24(AX) MOVQ DX, 24(AX)
MOVQ DI, 104(AX) MOVQ DI, 104(AX)
MOVQ 80(AX), CX SUBQ 80(AX), SI
SUBQ CX, SI
MOVQ SI, 112(AX) MOVQ SI, 112(AX)
RET RET

View File

@ -36,9 +36,6 @@ const forcePreDef = false
// zstdMinMatch is the minimum zstd match length. // zstdMinMatch is the minimum zstd match length.
const zstdMinMatch = 3 const zstdMinMatch = 3
// Reset the buffer offset when reaching this.
const bufferReset = math.MaxInt32 - MaxWindowSize
// fcsUnknown is used for unknown frame content size. // fcsUnknown is used for unknown frame content size.
const fcsUnknown = math.MaxUint64 const fcsUnknown = math.MaxUint64
@ -75,7 +72,6 @@ var (
ErrDecoderSizeExceeded = errors.New("decompressed size exceeds configured limit") ErrDecoderSizeExceeded = errors.New("decompressed size exceeds configured limit")
// ErrUnknownDictionary is returned if the dictionary ID is unknown. // ErrUnknownDictionary is returned if the dictionary ID is unknown.
// For the time being dictionaries are not supported.
ErrUnknownDictionary = errors.New("unknown dictionary") ErrUnknownDictionary = errors.New("unknown dictionary")
// ErrFrameSizeExceeded is returned if the stated frame size is exceeded. // ErrFrameSizeExceeded is returned if the stated frame size is exceeded.
@ -110,26 +106,25 @@ func printf(format string, a ...interface{}) {
} }
} }
// matchLen returns the maximum length. // matchLen returns the maximum common prefix length of a and b.
// a must be the shortest of the two. // a must be the shortest of the two.
// The function also returns whether all bytes matched. func matchLen(a, b []byte) (n int) {
func matchLen(a, b []byte) int { for ; len(a) >= 8 && len(b) >= 8; a, b = a[8:], b[8:] {
b = b[:len(a)] diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b)
for i := 0; i < len(a)-7; i += 8 { if diff != 0 {
if diff := load64(a, i) ^ load64(b, i); diff != 0 { return n + bits.TrailingZeros64(diff)>>3
return i + (bits.TrailingZeros64(diff) >> 3)
} }
n += 8
} }
checked := (len(a) >> 3) << 3
a = a[checked:]
b = b[checked:]
for i := range a { for i := range a {
if a[i] != b[i] { if a[i] != b[i] {
return i + checked break
} }
n++
} }
return len(a) + checked return n
} }
func load3232(b []byte, i int32) uint32 { func load3232(b []byte, i int32) uint32 {
@ -140,10 +135,6 @@ func load6432(b []byte, i int32) uint64 {
return binary.LittleEndian.Uint64(b[i:]) return binary.LittleEndian.Uint64(b[i:])
} }
func load64(b []byte, i int) uint64 {
return binary.LittleEndian.Uint64(b[i:])
}
type byter interface { type byter interface {
Bytes() []byte Bytes() []byte
Len() int Len() int

4
vendor/modules.txt vendored
View File

@ -136,8 +136,8 @@ github.com/imdario/mergo
# github.com/inconshreveable/mousetrap v1.1.0 # github.com/inconshreveable/mousetrap v1.1.0
## explicit; go 1.18 ## explicit; go 1.18
github.com/inconshreveable/mousetrap github.com/inconshreveable/mousetrap
# github.com/klauspost/compress v1.15.12 # github.com/klauspost/compress v1.16.3
## explicit; go 1.17 ## explicit; go 1.18
github.com/klauspost/compress github.com/klauspost/compress
github.com/klauspost/compress/fse github.com/klauspost/compress/fse
github.com/klauspost/compress/huff0 github.com/klauspost/compress/huff0