vendor: github.com/klauspost/compress v1.15.9

various fixes in zstd compression

- https://github.com/klauspost/compress/releases/tag/v1.15.9
- https://github.com/klauspost/compress/releases/tag/v1.15.8
- https://github.com/klauspost/compress/releases/tag/v1.15.7
- https://github.com/klauspost/compress/releases/tag/v1.15.6
- https://github.com/klauspost/compress/releases/tag/v1.15.5
- https://github.com/klauspost/compress/releases/tag/v1.15.4
- https://github.com/klauspost/compress/releases/tag/v1.15.3
- https://github.com/klauspost/compress/releases/tag/v1.15.2

full diff: https://github.com/klauspost/compress/compare/v1.15.1...v1.15.9

Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
2022-09-10 18:50:02 +02:00 · 2022-09-10 18:50:02 +02:00 · 8771b956ea
parent 813bd79471
commit 8771b956ea
47 changed files with 6415 additions and 2307 deletions
--- a/vendor.mod
+++ b/vendor.mod
@ -54,7 +54,7 @@ require (
 	github.com/golang/protobuf v1.5.2 // indirect
 	github.com/gorilla/mux v1.8.0 // indirect; updated to v1.8.0 to get rid of old compatibility for "context"
 	github.com/inconshreveable/mousetrap v1.0.0 // indirect
-	github.com/klauspost/compress v1.15.1 // indirect
+	github.com/klauspost/compress v1.15.9 // indirect
 	github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369 // indirect
 	github.com/miekg/pkcs11 v1.1.1 // indirect
 	github.com/moby/sys/symlink v0.2.0 // indirect
--- a/vendor.sum
+++ b/vendor.sum
@ -250,8 +250,8 @@ github.com/julienschmidt/httprouter v1.3.0/go.mod h1:JR6WtHb+2LUe8TCKY3cZOxFyyO8
 github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00=
 github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
 github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
-github.com/klauspost/compress v1.15.1 h1:y9FcTHGyrebwfP0ZZqFiaxTaiDnUrGkJkI+f583BL1A=
+github.com/klauspost/compress v1.15.9 h1:wKRjX6JRtDdrE9qwa4b/Cip7ACOshUI4smpCQanqjSY=
-github.com/klauspost/compress v1.15.1/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk=
+github.com/klauspost/compress v1.15.9/go.mod h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHUDtV4Yw2GlzU=
 github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
 github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
 github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc=
--- a/vendor/github.com/klauspost/compress/.gitignore
+++ b/vendor/github.com/klauspost/compress/.gitignore
@ -23,3 +23,10 @@ _testmain.go
 *.test
 *.prof
 /s2/cmd/_s2sx/sfx-exe
 # Linux perf files
 perf.data
 perf.data.old
 # gdb history
 .gdb_history
--- a/vendor/github.com/klauspost/compress/README.md
+++ b/vendor/github.com/klauspost/compress/README.md
@ -17,6 +17,72 @@ This package provides various compression algorithms.
 # changelog
 * July 13, 2022 (v1.15.8)
 	* gzip: fix stack exhaustion bug in Reader.Read https://github.com/klauspost/compress/pull/641
 	* s2: Add Index header trim/restore https://github.com/klauspost/compress/pull/638
 	* zstd: Optimize seqdeq amd64 asm by @greatroar in https://github.com/klauspost/compress/pull/636
 	* zstd: Improve decoder memcopy https://github.com/klauspost/compress/pull/637
 	* huff0: Pass a single bitReader pointer to asm by @greatroar in https://github.com/klauspost/compress/pull/634
 	* zstd: Branchless getBits for amd64 w/o BMI2 by @greatroar in https://github.com/klauspost/compress/pull/640
 	* gzhttp: Remove header before writing https://github.com/klauspost/compress/pull/639
 * June 29, 2022 (v1.15.7)
 	* s2: Fix absolute forward seeks  https://github.com/klauspost/compress/pull/633
 	* zip: Merge upstream  https://github.com/klauspost/compress/pull/631
 	* zip: Re-add zip64 fix https://github.com/klauspost/compress/pull/624
 	* zstd: translate fseDecoder.buildDtable into asm by @WojciechMula in https://github.com/klauspost/compress/pull/598
 	* flate: Faster histograms  https://github.com/klauspost/compress/pull/620
 	* deflate: Use compound hcode  https://github.com/klauspost/compress/pull/622
 * June 3, 2022 (v1.15.6)
 	* s2: Improve coding for long, close matches https://github.com/klauspost/compress/pull/613
 	* s2c: Add Snappy/S2 stream recompression https://github.com/klauspost/compress/pull/611
 	* zstd: Always use configured block size https://github.com/klauspost/compress/pull/605
 	* zstd: Fix incorrect hash table placement for dict encoding in default https://github.com/klauspost/compress/pull/606
 	* zstd: Apply default config to ZipDecompressor without options https://github.com/klauspost/compress/pull/608
 	* gzhttp: Exclude more common archive formats https://github.com/klauspost/compress/pull/612
 	* s2: Add ReaderIgnoreCRC https://github.com/klauspost/compress/pull/609
 	* s2: Remove sanity load on index creation https://github.com/klauspost/compress/pull/607
 	* snappy: Use dedicated function for scoring https://github.com/klauspost/compress/pull/614
 	* s2c+s2d: Use official snappy framed extension https://github.com/klauspost/compress/pull/610
 * May 25, 2022 (v1.15.5)
 	* s2: Add concurrent stream decompression https://github.com/klauspost/compress/pull/602
 	* s2: Fix final emit oob read crash on amd64 https://github.com/klauspost/compress/pull/601
 	* huff0: asm implementation of Decompress1X by @WojciechMula https://github.com/klauspost/compress/pull/596
 	* zstd: Use 1 less goroutine for stream decoding https://github.com/klauspost/compress/pull/588
 	* zstd: Copy literal in 16 byte blocks when possible https://github.com/klauspost/compress/pull/592
 	* zstd: Speed up when WithDecoderLowmem(false) https://github.com/klauspost/compress/pull/599
 	* zstd: faster next state update in BMI2 version of decode by @WojciechMula in https://github.com/klauspost/compress/pull/593
 	* huff0: Do not check max size when reading table. https://github.com/klauspost/compress/pull/586
 	* flate: Inplace hashing for level 7-9 by @klauspost in https://github.com/klauspost/compress/pull/590
 * May 11, 2022 (v1.15.4)
 	* huff0: decompress directly into output by @WojciechMula in [#577](https://github.com/klauspost/compress/pull/577)
 	* inflate: Keep dict on stack [#581](https://github.com/klauspost/compress/pull/581)
 	* zstd: Faster decoding memcopy in asm [#583](https://github.com/klauspost/compress/pull/583)
 	* zstd: Fix ignored crc [#580](https://github.com/klauspost/compress/pull/580)
 * May 5, 2022 (v1.15.3)
 	* zstd: Allow to ignore checksum checking by @WojciechMula [#572](https://github.com/klauspost/compress/pull/572)
 	* s2: Fix incorrect seek for io.SeekEnd in [#575](https://github.com/klauspost/compress/pull/575)
 * Apr 26, 2022 (v1.15.2)
 	* zstd: Add x86-64 assembly for decompression on streams and blocks. Contributed by [@WojciechMula](https://github.com/WojciechMula). Typically 2x faster.  [#528](https://github.com/klauspost/compress/pull/528) [#531](https://github.com/klauspost/compress/pull/531) [#545](https://github.com/klauspost/compress/pull/545) [#537](https://github.com/klauspost/compress/pull/537)
 	* zstd: Add options to ZipDecompressor and fixes [#539](https://github.com/klauspost/compress/pull/539)
 	* s2: Use sorted search for index [#555](https://github.com/klauspost/compress/pull/555)
 	* Minimum version is Go 1.16, added CI test on 1.18.
 * Mar 11, 2022 (v1.15.1)
 	* huff0: Add x86 assembly of Decode4X by @WojciechMula in [#512](https://github.com/klauspost/compress/pull/512)
 	* zstd: Reuse zip decoders in [#514](https://github.com/klauspost/compress/pull/514)
 	* zstd: Detect extra block data and report as corrupted in [#520](https://github.com/klauspost/compress/pull/520)
 	* zstd: Handle zero sized frame content size stricter in [#521](https://github.com/klauspost/compress/pull/521)
 	* zstd: Add stricter block size checks in [#523](https://github.com/klauspost/compress/pull/523)
 * Mar 3, 2022 (v1.15.0)
 	* zstd: Refactor decoder by @klauspost in [#498](https://github.com/klauspost/compress/pull/498)
 	* zstd: Add stream encoding without goroutines by @klauspost in [#505](https://github.com/klauspost/compress/pull/505)
@ -60,6 +126,9 @@ While the release has been extensively tested, it is recommended to testing when
 	* zstd: add arm64 xxhash assembly in [#464](https://github.com/klauspost/compress/pull/464)
 	* Add garbled for binaries for s2 in [#445](https://github.com/klauspost/compress/pull/445)
 <details>
 	<summary>See changes to v1.13.x</summary>
 * Aug 30, 2021 (v1.13.5)
 	* gz/zlib/flate: Alias stdlib errors [#425](https://github.com/klauspost/compress/pull/425)
 	* s2: Add block support to commandline tools [#413](https://github.com/klauspost/compress/pull/413)
@ -88,6 +157,8 @@ While the release has been extensively tested, it is recommended to testing when
 	* Added [gzhttp](https://github.com/klauspost/compress/tree/master/gzhttp#gzip-handler) which allows wrapping HTTP servers and clients with GZIP compressors.
 	* zstd: Detect short invalid signatures [#382](https://github.com/klauspost/compress/pull/382)
 	* zstd: Spawn decoder goroutine only if needed. [#380](https://github.com/klauspost/compress/pull/380)
 </details>
 <details>
 	<summary>See changes to v1.12.x</summary>
--- a/vendor/github.com/klauspost/compress/huff0/autogen.go
+++ b/vendor/github.com/klauspost/compress/huff0/autogen.go
@ -1,5 +0,0 @@
 package huff0
 //go:generate go run generate.go
 //go:generate asmfmt -w decompress_amd64.s
 //go:generate asmfmt -w decompress_8b_amd64.s
--- a/vendor/github.com/klauspost/compress/huff0/bitreader.go
+++ b/vendor/github.com/klauspost/compress/huff0/bitreader.go
@ -165,11 +165,6 @@ func (b *bitReaderShifted) peekBitsFast(n uint8) uint16 {
 	return uint16(b.value >> ((64 - n) & 63))
 }
 // peekTopBits shifts the bit container right by n and returns the low 16 bits
 // of the result. For n below 64 this is equivalent to peekBitsFast(64 - n).
 func (b *bitReaderShifted) peekTopBits(n uint8) uint16 {
 	shifted := b.value >> n
 	return uint16(shifted)
 }
 func (b *bitReaderShifted) advance(n uint8) {
 	b.bitsRead += n
 	b.value <<= n & 63
@ -220,11 +215,6 @@ func (b *bitReaderShifted) fill() {
 	}
 }
 // finished reports whether the whole bit stream has been consumed: no input
 // offset is left (off is zero) and at least 64 bits of the container were read.
 func (b *bitReaderShifted) finished() bool {
 	if b.off != 0 {
 		return false
 	}
 	return b.bitsRead >= 64
 }
 // remaining returns b.off*8 plus the bits of the 64-bit container that have
 // not been read yet (64 - bitsRead, computed in uint8 arithmetic).
 func (b *bitReaderShifted) remaining() uint {
 	unreadInContainer := 64 - b.bitsRead
 	return b.off*8 + uint(unreadInContainer)
 }
--- a/vendor/github.com/klauspost/compress/huff0/bitwriter.go
+++ b/vendor/github.com/klauspost/compress/huff0/bitwriter.go
@ -5,8 +5,6 @@
 package huff0
 import "fmt"
 // bitWriter will write bits.
 // First bit will be LSB of the first byte of output.
 type bitWriter struct {
@ -23,14 +21,6 @@ var bitMask16 = [32]uint16{
 	0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
 	0xFFFF, 0xFFFF} /* up to 16 bits */
 // addBits16NC appends the low `bits` bits of value (at most 16) to the bit
 // container. No capacity check is performed, so the caller is responsible for
 // having flushed recently enough that the bits fit.
 func (b *bitWriter) addBits16NC(value uint16, bits uint8) {
 	masked := uint64(value & bitMask16[bits&31])
 	b.bitContainer |= masked << (b.nBits & 63)
 	b.nBits += bits
 }
 // addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated.
 // It will not check if there is space for them, so the caller must ensure that it has flushed recently.
 func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
@ -70,104 +60,6 @@ func (b *bitWriter) encTwoSymbols(ct cTable, av, bv byte) {
 	b.nBits += encA.nBits + encB.nBits
 }
 // addBits16ZeroNC appends up to 16 bits of value to the bit container.
 // A bit count of zero is a no-op; this is the variant to use when bits may
 // be zero. No capacity check is performed, so the caller must have flushed
 // recently.
 func (b *bitWriter) addBits16ZeroNC(value uint16, bits uint8) {
 	if bits != 0 {
 		// Shift up and back down to clear everything above the low `bits` bits.
 		drop := (16 - bits) & 15
 		value = value << drop >> drop
 		b.bitContainer |= uint64(value) << (b.nBits & 63)
 		b.nBits += bits
 	}
 }
 // flush writes every complete byte currently held in the bit container to
 // the output, least significant byte first, and keeps the leftover (fewer
 // than 8) bits. Afterwards at least 56 bits are free for writing.
 // flush32 is faster but leaves less room.
 // It panics if the container claims to hold more than 64 bits.
 func (b *bitWriter) flush() {
 	full := b.nBits >> 3
 	if full == 0 {
 		return
 	}
 	if full > 8 {
 		panic(fmt.Errorf("bits (%d) > 64", b.nBits))
 	}
 	// Stage all eight container bytes, then append only the complete ones
 	// in a single call.
 	var raw [8]byte
 	for i := range raw {
 		raw[i] = byte(b.bitContainer >> (8 * uint(i)))
 	}
 	b.out = append(b.out, raw[:full]...)
 	if full == 8 {
 		// The whole container was written out; nothing is left over.
 		b.bitContainer = 0
 		b.nBits = 0
 		return
 	}
 	b.bitContainer >>= full << 3
 	b.nBits &= 7
 }
 // flush32 will flush out, so there are at least 32 bits available for writing.
 func (b *bitWriter) flush32() {
 	if b.nBits < 32 {
@ -201,10 +93,3 @@ func (b *bitWriter) close() error {
 	b.flushAlign()
 	return nil
 }
 // reset clears the pending bits and makes out the output slice, so that
 // subsequent writes append to it.
 func (b *bitWriter) reset(out []byte) {
 	b.bitContainer, b.nBits, b.out = 0, 0, out
 }
--- a/vendor/github.com/klauspost/compress/huff0/bytereader.go
+++ b/vendor/github.com/klauspost/compress/huff0/bytereader.go
@ -20,11 +20,6 @@ func (b *byteReader) init(in []byte) {
 	b.off = 0
 }
 // advance moves the read offset forward by n bytes.
 func (b *byteReader) advance(n uint) {
 	step := int(n)
 	b.off += step
 }
 // Int32 returns a little endian int32 starting at current offset.
 func (b byteReader) Int32() int32 {
 	v3 := int32(b.b[b.off+3])
@ -43,11 +38,6 @@ func (b byteReader) Uint32() uint32 {
 	return (v3 << 24) | (v2 << 16) | (v1 << 8) | v0
 }
 // unread returns the part of the input that lies at or after the current
 // offset. The result aliases the reader's buffer.
 func (b byteReader) unread() []byte {
 	tail := b.b[b.off:]
 	return tail
 }
 // remain will return the number of bytes remaining.
 func (b byteReader) remain() int {
 	return len(b.b) - b.off
--- a/vendor/github.com/klauspost/compress/huff0/compress.go
+++ b/vendor/github.com/klauspost/compress/huff0/compress.go
@ -404,6 +404,7 @@ func (s *Scratch) canUseTable(c cTable) bool {
 	return true
 }
 //lint:ignore U1000 used for debugging
 func (s *Scratch) validateTable(c cTable) bool {
 	if len(c) < int(s.symbolLen) {
 		return false
--- a/vendor/github.com/klauspost/compress/huff0/decompress.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress.go
@ -11,7 +11,6 @@ import (
 // dTable holds the decoding tables: one slice of entries for single-symbol
 // decoding and one for double-symbol decoding.
 type dTable struct {
 	single []dEntrySingle
 	double []dEntryDouble
 }
 // single-symbols decoding
@ -19,13 +18,6 @@ type dEntrySingle struct {
 	entry uint16
 }
 // dEntryDouble is a table entry for double-symbols decoding.
 // NOTE(review): presumably seq holds the decoded symbols, nBits the number of
 // bits consumed and len the number of valid symbols in seq — confirm against
 // the table builder, which is not visible here.
 type dEntryDouble struct {
 	seq   [4]byte
 	nBits uint8
 	len   uint8
 }
 // Uses special code for all tables that are < 8 bits.
 const use8BitTables = true
@ -35,7 +27,7 @@ const use8BitTables = true
 // If no Scratch is provided a new one is allocated.
 // The returned Scratch can be used for encoding or decoding input using this table.
 func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
-	s, err = s.prepare(in)
+	s, err = s.prepare(nil)
 	if err != nil {
 		return s, nil, err
 	}
@ -236,108 +228,6 @@ func (d *Decoder) buffer() *[4][256]byte {
 	return &[4][256]byte{}
 }
 // Decompress1X will decompress a 1X encoded stream.
 // The cap of the output buffer will be the maximum decompressed size.
 // The length of the supplied input must match the end of a block exactly.
 //
 // An error is returned when no decoding table is loaded, when the bit reader
 // cannot be initialised from src, or (ErrMaxDecodedSizeExceeded) when the
 // output would grow beyond cap(dst). When use8BitTables is set and the actual
 // table log is at most 8, decoding is delegated to decompress1X8Bit.
 func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 	if len(d.dt.single) == 0 {
 		return nil, errors.New("no table loaded")
 	}
 	if use8BitTables && d.actualTableLog <= 8 {
 		return d.decompress1X8Bit(dst, src)
 	}
 	var br bitReaderShifted
 	err := br.init(src)
 	if err != nil {
 		return dst, err
 	}
 	maxDecodedSize := cap(dst)
 	dst = dst[:0]
 	// Avoid bounds check by always having full sized table.
 	const tlSize = 1 << tableLogMax
 	const tlMask = tlSize - 1
 	dt := d.dt.single[:tlSize]
 	// Use temp table to avoid bound checks/append penalty.
 	bufs := d.buffer()
 	buf := &bufs[0]
 	// off is a uint8 index into the 256-byte buffer; it wraps to 0 exactly
 	// when the buffer has been filled.
 	var off uint8
 	// Fast path: decode four symbols per iteration while at least 8 input
 	// bytes remain.
 	for br.off >= 8 {
 		br.fillFast()
 		v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
 		br.advance(uint8(v.entry))
 		buf[off+0] = uint8(v.entry >> 8)
 		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
 		br.advance(uint8(v.entry))
 		buf[off+1] = uint8(v.entry >> 8)
 		// Refill
 		br.fillFast()
 		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
 		br.advance(uint8(v.entry))
 		buf[off+2] = uint8(v.entry >> 8)
 		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
 		br.advance(uint8(v.entry))
 		buf[off+3] = uint8(v.entry >> 8)
 		off += 4
 		if off == 0 {
 			// The buffer is full: move all 256 bytes to the output.
 			if len(dst)+256 > maxDecodedSize {
 				br.close()
 				d.bufs.Put(bufs)
 				return nil, ErrMaxDecodedSizeExceeded
 			}
 			dst = append(dst, buf[:]...)
 		}
 	}
 	// Move whatever is left in the buffer to the output.
 	if len(dst)+int(off) > maxDecodedSize {
 		d.bufs.Put(bufs)
 		br.close()
 		return nil, ErrMaxDecodedSizeExceeded
 	}
 	dst = append(dst, buf[:off]...)
 	// br < 8, so uint8 is fine
 	bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
 	// Slow path: decode the tail one symbol at a time.
 	for bitsLeft > 0 {
 		br.fill()
 		if len(dst) >= maxDecodedSize {
 			d.bufs.Put(bufs)
 			br.close()
 			return nil, ErrMaxDecodedSizeExceeded
 		}
 		v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
 		nBits := uint8(v.entry)
 		br.advance(nBits)
 		bitsLeft -= nBits
 		dst = append(dst, uint8(v.entry>>8))
 	}
 	d.bufs.Put(bufs)
 	return dst, br.close()
 }
 // decompress1X8Bit will decompress a 1X encoded stream with tablelog <= 8.
 // The cap of the output buffer will be the maximum decompressed size.
 // The length of the supplied input must match the end of a block exactly.
@ -995,7 +885,6 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
 	const shift = 56
 	const tlSize = 1 << 8
 	const tlMask = tlSize - 1
 	single := d.dt.single[:tlSize]
 	// Use temp table to avoid bound checks/append penalty.
--- a/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s
@ -1,488 +0,0 @@
 // +build !appengine
 // +build gc
 // +build !noasm
 #include "textflag.h"
 #include "funcdata.h"
 #include "go_asm.h"
 #define bufoff      256 // see decompress.go, we're using [4][256]byte table
 // decompress4x_8b_loop_x86 decodes four bit streams in lock step, four symbols
 // per stream per iteration. Stream i is written at byte offset i*256 from buf
 // (see bufoff). The loop ends once `off` reaches bufoff or, after a refill,
 // any reader has fewer than 4 input bytes left; DL is scratch for that test
 // and DH accumulates the "exhausted" flag.
 //
 // The low byte of `off` is stored as the result.
 // NOTE(review): the result is written with a single MOVB, so the Go stub
 // presumably returns uint8 — confirm against the Go declaration.
 //
 // func decompress4x_8b_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
 //	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
 TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8
 #define off             R8
 #define buffer          DI
 #define table           SI
 #define br_bits_read    R9
 #define br_value        R10
 #define br_offset       R11
 #define peek_bits       R12
 #define exhausted       DX
 #define br0             R13
 #define br1             R14
 #define br2             R15
 #define br3             BP
 	// BP is used as br3, so save it in the 8-byte frame and restore it at end.
 	MOVQ BP, 0(SP)
 	XORQ exhausted, exhausted // exhausted = false
 	XORQ off, off             // off = 0
 	MOVBQZX peekBits+32(FP), peek_bits
 	MOVQ    buf+40(FP), buffer
 	MOVQ    tbl+48(FP), table
 	MOVQ pbr0+0(FP), br0
 	MOVQ pbr1+8(FP), br1
 	MOVQ pbr2+16(FP), br2
 	MOVQ pbr3+24(FP), br3
 main_loop:
 	// const stream = 0
 	// br0.fillFast()
 	MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
 	MOVQ    bitReaderShifted_value(br0), br_value
 	MOVQ    bitReaderShifted_off(br0), br_offset
 	// if b.bitsRead >= 32 {
 	CMPQ br_bits_read, $32
 	JB   skip_fill0
 	SUBQ $32, br_bits_read // b.bitsRead -= 32
 	SUBQ $4, br_offset     // b.off -= 4
 	// v := b.in[b.off-4 : b.off]
 	// v = v[:4]
 	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 	MOVQ bitReaderShifted_in(br0), AX
 	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
 	// b.value |= uint64(low) << (b.bitsRead & 63)
 	MOVQ br_bits_read, CX
 	SHLQ CL, AX
 	ORQ  AX, br_value
 	// exhausted = exhausted || (br0.off < 4)
 	CMPQ  br_offset, $4
 	SETLT DL
 	ORB   DL, DH
 	// }
 skip_fill0:
 	// val0 := br0.peekTopBits(peekBits)
 	MOVQ br_value, AX
 	MOVQ peek_bits, CX
 	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
 	// v0 := table[val0&mask]
 	MOVW 0(table)(AX*2), AX // AX - v0
 	// br0.advance(uint8(v0.entry))
 	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
 	MOVBQZX AL, CX
 	SHLQ    CL, br_value     // value <<= n
 	ADDQ    CX, br_bits_read // bits_read += n
 	// val1 := br0.peekTopBits(peekBits)
 	MOVQ peek_bits, CX
 	MOVQ br_value, AX
 	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
 	// v1 := table[val1&mask]
 	MOVW 0(table)(AX*2), AX // AX - v1
 	// br0.advance(uint8(v1.entry))
 	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
 	MOVBQZX AL, CX
 	SHLQ    CX, br_value     // value <<= n
 	ADDQ    CX, br_bits_read // bits_read += n
 	// these two writes get coalesced
 	// buf[stream][off] = uint8(v0.entry >> 8)
 	// buf[stream][off+1] = uint8(v1.entry >> 8)
 	MOVW BX, 0(buffer)(off*1)
 	// SECOND PART:
 	// val2 := br0.peekTopBits(peekBits)
 	MOVQ br_value, AX
 	MOVQ peek_bits, CX
 	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
 	// v2 := table[val2&mask]
 	MOVW 0(table)(AX*2), AX // AX - v2
 	// br0.advance(uint8(v2.entry))
 	MOVB    AH, BL           // BL = uint8(v2.entry >> 8)
 	MOVBQZX AL, CX
 	SHLQ    CL, br_value     // value <<= n
 	ADDQ    CX, br_bits_read // bits_read += n
 	// val3 := br0.peekTopBits(peekBits)
 	MOVQ peek_bits, CX
 	MOVQ br_value, AX
 	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
 	// v3 := table[val3&mask]
 	MOVW 0(table)(AX*2), AX // AX - v3
 	// br0.advance(uint8(v3.entry))
 	MOVB    AH, BH           // BH = uint8(v3.entry >> 8)
 	MOVBQZX AL, CX
 	SHLQ    CX, br_value     // value <<= n
 	ADDQ    CX, br_bits_read // bits_read += n
 	// these two writes get coalesced
 	// buf[stream][off+2] = uint8(v2.entry >> 8)
 	// buf[stream][off+3] = uint8(v3.entry >> 8)
 	MOVW BX, 0+2(buffer)(off*1)
 	// update the bit reader structure
 	MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
 	MOVQ br_value, bitReaderShifted_value(br0)
 	MOVQ br_offset, bitReaderShifted_off(br0)
 	// const stream = 1
 	// br1.fillFast()
 	MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
 	MOVQ    bitReaderShifted_value(br1), br_value
 	MOVQ    bitReaderShifted_off(br1), br_offset
 	// if b.bitsRead >= 32 {
 	CMPQ br_bits_read, $32
 	JB   skip_fill1
 	SUBQ $32, br_bits_read // b.bitsRead -= 32
 	SUBQ $4, br_offset     // b.off -= 4
 	// v := b.in[b.off-4 : b.off]
 	// v = v[:4]
 	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 	MOVQ bitReaderShifted_in(br1), AX
 	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
 	// b.value |= uint64(low) << (b.bitsRead & 63)
 	MOVQ br_bits_read, CX
 	SHLQ CL, AX
 	ORQ  AX, br_value
 	// exhausted = exhausted || (br1.off < 4)
 	CMPQ  br_offset, $4
 	SETLT DL
 	ORB   DL, DH
 	// }
 skip_fill1:
 	// val0 := br1.peekTopBits(peekBits)
 	MOVQ br_value, AX
 	MOVQ peek_bits, CX
 	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
 	// v0 := table[val0&mask]
 	MOVW 0(table)(AX*2), AX // AX - v0
 	// br1.advance(uint8(v0.entry))
 	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
 	MOVBQZX AL, CX
 	SHLQ    CL, br_value     // value <<= n
 	ADDQ    CX, br_bits_read // bits_read += n
 	// val1 := br1.peekTopBits(peekBits)
 	MOVQ peek_bits, CX
 	MOVQ br_value, AX
 	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
 	// v1 := table[val1&mask]
 	MOVW 0(table)(AX*2), AX // AX - v1
 	// br1.advance(uint8(v1.entry))
 	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
 	MOVBQZX AL, CX
 	SHLQ    CX, br_value     // value <<= n
 	ADDQ    CX, br_bits_read // bits_read += n
 	// these two writes get coalesced
 	// buf[stream][off] = uint8(v0.entry >> 8)
 	// buf[stream][off+1] = uint8(v1.entry >> 8)
 	MOVW BX, 256(buffer)(off*1)
 	// SECOND PART:
 	// val2 := br1.peekTopBits(peekBits)
 	MOVQ br_value, AX
 	MOVQ peek_bits, CX
 	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
 	// v2 := table[val2&mask]
 	MOVW 0(table)(AX*2), AX // AX - v2
 	// br1.advance(uint8(v2.entry))
 	MOVB    AH, BL           // BL = uint8(v2.entry >> 8)
 	MOVBQZX AL, CX
 	SHLQ    CL, br_value     // value <<= n
 	ADDQ    CX, br_bits_read // bits_read += n
 	// val3 := br1.peekTopBits(peekBits)
 	MOVQ peek_bits, CX
 	MOVQ br_value, AX
 	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
 	// v3 := table[val3&mask]
 	MOVW 0(table)(AX*2), AX // AX - v3
 	// br1.advance(uint8(v3.entry))
 	MOVB    AH, BH           // BH = uint8(v3.entry >> 8)
 	MOVBQZX AL, CX
 	SHLQ    CX, br_value     // value <<= n
 	ADDQ    CX, br_bits_read // bits_read += n
 	// these two writes get coalesced
 	// buf[stream][off+2] = uint8(v2.entry >> 8)
 	// buf[stream][off+3] = uint8(v3.entry >> 8)
 	MOVW BX, 256+2(buffer)(off*1)
 	// update the bit reader structure
 	MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
 	MOVQ br_value, bitReaderShifted_value(br1)
 	MOVQ br_offset, bitReaderShifted_off(br1)
 	// const stream = 2
 	// br2.fillFast()
 	MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
 	MOVQ    bitReaderShifted_value(br2), br_value
 	MOVQ    bitReaderShifted_off(br2), br_offset
 	// if b.bitsRead >= 32 {
 	CMPQ br_bits_read, $32
 	JB   skip_fill2
 	SUBQ $32, br_bits_read // b.bitsRead -= 32
 	SUBQ $4, br_offset     // b.off -= 4
 	// v := b.in[b.off-4 : b.off]
 	// v = v[:4]
 	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 	MOVQ bitReaderShifted_in(br2), AX
 	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
 	// b.value |= uint64(low) << (b.bitsRead & 63)
 	MOVQ br_bits_read, CX
 	SHLQ CL, AX
 	ORQ  AX, br_value
 	// exhausted = exhausted || (br2.off < 4)
 	CMPQ  br_offset, $4
 	SETLT DL
 	ORB   DL, DH
 	// }
 skip_fill2:
 	// val0 := br2.peekTopBits(peekBits)
 	MOVQ br_value, AX
 	MOVQ peek_bits, CX
 	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
 	// v0 := table[val0&mask]
 	MOVW 0(table)(AX*2), AX // AX - v0
 	// br2.advance(uint8(v0.entry))
 	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
 	MOVBQZX AL, CX
 	SHLQ    CL, br_value     // value <<= n
 	ADDQ    CX, br_bits_read // bits_read += n
 	// val1 := br2.peekTopBits(peekBits)
 	MOVQ peek_bits, CX
 	MOVQ br_value, AX
 	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
 	// v1 := table[val1&mask]
 	MOVW 0(table)(AX*2), AX // AX - v1
 	// br2.advance(uint8(v1.entry))
 	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
 	MOVBQZX AL, CX
 	SHLQ    CX, br_value     // value <<= n
 	ADDQ    CX, br_bits_read // bits_read += n
 	// these two writes get coalesced
 	// buf[stream][off] = uint8(v0.entry >> 8)
 	// buf[stream][off+1] = uint8(v1.entry >> 8)
 	MOVW BX, 512(buffer)(off*1)
 	// SECOND PART:
 	// val2 := br2.peekTopBits(peekBits)
 	MOVQ br_value, AX
 	MOVQ peek_bits, CX
 	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
 	// v2 := table[val2&mask]
 	MOVW 0(table)(AX*2), AX // AX - v2
 	// br2.advance(uint8(v2.entry))
 	MOVB    AH, BL           // BL = uint8(v2.entry >> 8)
 	MOVBQZX AL, CX
 	SHLQ    CL, br_value     // value <<= n
 	ADDQ    CX, br_bits_read // bits_read += n
 	// val3 := br2.peekTopBits(peekBits)
 	MOVQ peek_bits, CX
 	MOVQ br_value, AX
 	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
 	// v3 := table[val3&mask]
 	MOVW 0(table)(AX*2), AX // AX - v3
 	// br2.advance(uint8(v3.entry))
 	MOVB    AH, BH           // BH = uint8(v3.entry >> 8)
 	MOVBQZX AL, CX
 	SHLQ    CX, br_value     // value <<= n
 	ADDQ    CX, br_bits_read // bits_read += n
 	// these two writes get coalesced
 	// buf[stream][off+2] = uint8(v2.entry >> 8)
 	// buf[stream][off+3] = uint8(v3.entry >> 8)
 	MOVW BX, 512+2(buffer)(off*1)
 	// update the bit reader structure
 	MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
 	MOVQ br_value, bitReaderShifted_value(br2)
 	MOVQ br_offset, bitReaderShifted_off(br2)
 	// const stream = 3
 	// br3.fillFast()
 	MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
 	MOVQ    bitReaderShifted_value(br3), br_value
 	MOVQ    bitReaderShifted_off(br3), br_offset
 	// if b.bitsRead >= 32 {
 	CMPQ br_bits_read, $32
 	JB   skip_fill3
 	SUBQ $32, br_bits_read // b.bitsRead -= 32
 	SUBQ $4, br_offset     // b.off -= 4
 	// v := b.in[b.off-4 : b.off]
 	// v = v[:4]
 	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 	MOVQ bitReaderShifted_in(br3), AX
 	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
 	// b.value |= uint64(low) << (b.bitsRead & 63)
 	MOVQ br_bits_read, CX
 	SHLQ CL, AX
 	ORQ  AX, br_value
 	// exhausted = exhausted || (br3.off < 4)
 	CMPQ  br_offset, $4
 	SETLT DL
 	ORB   DL, DH
 	// }
 skip_fill3:
 	// val0 := br3.peekTopBits(peekBits)
 	MOVQ br_value, AX
 	MOVQ peek_bits, CX
 	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
 	// v0 := table[val0&mask]
 	MOVW 0(table)(AX*2), AX // AX - v0
 	// br3.advance(uint8(v0.entry))
 	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
 	MOVBQZX AL, CX
 	SHLQ    CL, br_value     // value <<= n
 	ADDQ    CX, br_bits_read // bits_read += n
 	// val1 := br3.peekTopBits(peekBits)
 	MOVQ peek_bits, CX
 	MOVQ br_value, AX
 	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
 	// v1 := table[val1&mask]
 	MOVW 0(table)(AX*2), AX // AX - v1
 	// br3.advance(uint8(v1.entry))
 	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
 	MOVBQZX AL, CX
 	SHLQ    CX, br_value     // value <<= n
 	ADDQ    CX, br_bits_read // bits_read += n
 	// these two writes get coalesced
 	// buf[stream][off] = uint8(v0.entry >> 8)
 	// buf[stream][off+1] = uint8(v1.entry >> 8)
 	MOVW BX, 768(buffer)(off*1)
 	// SECOND PART:
 	// val2 := br3.peekTopBits(peekBits)
 	MOVQ br_value, AX
 	MOVQ peek_bits, CX
 	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
 	// v2 := table[val2&mask]
 	MOVW 0(table)(AX*2), AX // AX - v2
 	// br3.advance(uint8(v2.entry))
 	MOVB    AH, BL           // BL = uint8(v2.entry >> 8)
 	MOVBQZX AL, CX
 	SHLQ    CL, br_value     // value <<= n
 	ADDQ    CX, br_bits_read // bits_read += n
 	// val3 := br3.peekTopBits(peekBits)
 	MOVQ peek_bits, CX
 	MOVQ br_value, AX
 	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
 	// v3 := table[val3&mask]
 	MOVW 0(table)(AX*2), AX // AX - v3
 	// br3.advance(uint8(v3.entry))
 	MOVB    AH, BH           // BH = uint8(v3.entry >> 8)
 	MOVBQZX AL, CX
 	SHLQ    CX, br_value     // value <<= n
 	ADDQ    CX, br_bits_read // bits_read += n
 	// these two writes get coalesced
 	// buf[stream][off+2] = uint8(v2.entry >> 8)
 	// buf[stream][off+3] = uint8(v3.entry >> 8)
 	MOVW BX, 768+2(buffer)(off*1)
 	// update the bit reader structure
 	MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
 	MOVQ br_value, bitReaderShifted_value(br3)
 	MOVQ br_offset, bitReaderShifted_off(br3)
 	ADDQ $4, off // off += 4
 	TESTB DH, DH // any br[i].off < 4?
 	JNZ   end
 	CMPQ off, $bufoff
 	JL   main_loop
 end:
 	MOVQ 0(SP), BP
 	// Only the low byte of off is returned, so a full buffer (off == 256)
 	// is reported as 0.
 	MOVB off, ret+56(FP)
 	RET
 #undef off
 #undef buffer
 #undef table
 #undef br_bits_read
 #undef br_value
 #undef br_offset
 #undef peek_bits
 #undef exhausted
 #undef br0
 #undef br1
 #undef br2
 #undef br3
--- a/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in
@ -1,197 +0,0 @@
 // +build !appengine
 // +build gc
 // +build !noasm
 #include "textflag.h"
 #include "funcdata.h"
 #include "go_asm.h"
 #define bufoff      256     // see decompress.go, we're using [4][256]byte table
 // func decompress4x_8b_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
 //	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
 //
 // Template for the Decompress4X main loop used when tablelog <= 8.
 // Every pass of main_loop decodes 4 symbols from each of the 4 bit readers
 // and stores them at buf[stream][off:off+4] (buf is a [4][256]byte table,
 // see bufoff). The loop stops when off reaches bufoff or when, after a
 // refill, the offset of any reader is below 4. The low byte of off is
 // stored as the result.
 TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8
 // Register assignment. br3 lives in BP, so the caller's BP is saved in the
 // 8-byte frame on entry and restored at "end".
 #define off             R8
 #define buffer          DI
 #define table           SI
 #define br_bits_read    R9
 #define br_value        R10
 #define br_offset       R11
 #define peek_bits       R12
 #define exhausted       DX
 #define br0             R13
 #define br1             R14
 #define br2             R15
 #define br3             BP
    MOVQ    BP, 0(SP)
    XORQ    exhausted, exhausted    // exhausted = false
    XORQ    off, off                // off = 0
    MOVBQZX peekBits+32(FP), peek_bits
    MOVQ    buf+40(FP), buffer
    MOVQ    tbl+48(FP), table
    MOVQ    pbr0+0(FP), br0
    MOVQ    pbr1+8(FP), br1
    MOVQ    pbr2+16(FP), br2
    MOVQ    pbr3+24(FP), br3
 main_loop:
 {{ define "decode_2_values_x86" }}
    // const stream = {{ var "id" }}
    // br{{ var "id"}}.fillFast()
    MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
    MOVQ    bitReaderShifted_value(br{{ var "id" }}), br_value
    MOVQ    bitReaderShifted_off(br{{ var "id" }}), br_offset
 	// if b.bitsRead >= 32 {
    CMPQ    br_bits_read, $32
    JB      skip_fill{{ var "id" }}
    SUBQ    $32, br_bits_read       // b.bitsRead -= 32
    SUBQ    $4, br_offset           // b.off -= 4
 	// v := b.in[b.off-4 : b.off]
 	// v = v[:4]
 	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
    MOVQ    bitReaderShifted_in(br{{ var "id" }}), AX
    MOVL    0(br_offset)(AX*1), AX  // AX = uint32(b.in[b.off:b.off+4])
 	// b.value |= uint64(low) << (b.bitsRead & 63)
    MOVQ    br_bits_read, CX
    SHLQ    CL, AX
    ORQ     AX, br_value
    // exhausted = exhausted || (br{{ var "id"}}.off < 4)
    // DL is scratch for the comparison result; DH accumulates the flag.
    CMPQ    br_offset, $4
    SETLT   DL
    ORB     DL, DH
    // }
 skip_fill{{ var "id" }}:
    // val0 := br{{ var "id"}}.peekTopBits(peekBits)
    MOVQ    br_value, AX
    MOVQ    peek_bits, CX
    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
    // v0 := table[val0&mask]
    MOVW    0(table)(AX*2), AX      // AX - v0
    // br{{ var "id"}}.advance(uint8(v0.entry))
    MOVB    AH, BL                  // BL = uint8(v0.entry >> 8)
    MOVBQZX AL, CX
    SHLQ    CL, br_value            // value <<= n
    ADDQ    CX, br_bits_read        // bits_read += n
    // val1 := br{{ var "id"}}.peekTopBits(peekBits)
    MOVQ    peek_bits, CX
    MOVQ    br_value, AX
    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
    // v1 := table[val1&mask]
    MOVW    0(table)(AX*2), AX      // AX - v1
    // br{{ var "id"}}.advance(uint8(v1.entry))
    MOVB    AH, BH                  // BH = uint8(v1.entry >> 8)
    MOVBQZX AL, CX
    SHLQ    CX, br_value            // value <<= n
    ADDQ    CX, br_bits_read        // bits_read += n
    // these two writes get coalesced
    // buf[stream][off] = uint8(v0.entry >> 8)
    // buf[stream][off+1] = uint8(v1.entry >> 8)
    MOVW    BX, {{ var "bufofs" }}(buffer)(off*1)
    // SECOND PART:
    // val2 := br{{ var "id"}}.peekTopBits(peekBits)
    MOVQ    br_value, AX
    MOVQ    peek_bits, CX
    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
    // v2 := table[val2&mask]
    MOVW    0(table)(AX*2), AX      // AX - v2
    // br{{ var "id"}}.advance(uint8(v2.entry))
    MOVB    AH, BL                  // BL = uint8(v2.entry >> 8)
    MOVBQZX AL, CX
    SHLQ    CL, br_value            // value <<= n
    ADDQ    CX, br_bits_read        // bits_read += n
    // val3 := br{{ var "id"}}.peekTopBits(peekBits)
    MOVQ    peek_bits, CX
    MOVQ    br_value, AX
    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
    // v3 := table[val3&mask]
    MOVW    0(table)(AX*2), AX      // AX - v3
    // br{{ var "id"}}.advance(uint8(v3.entry))
    MOVB    AH, BH                  // BH = uint8(v3.entry >> 8)
    MOVBQZX AL, CX
    SHLQ    CX, br_value            // value <<= n
    ADDQ    CX, br_bits_read        // bits_read += n
    // these two writes get coalesced
    // buf[stream][off+2] = uint8(v2.entry >> 8)
    // buf[stream][off+3] = uint8(v3.entry >> 8)
    MOVW    BX, {{ var "bufofs" }}+2(buffer)(off*1)
    // write the updated state back to the bit reader structure
    MOVB    br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
    MOVQ    br_value, bitReaderShifted_value(br{{ var "id" }})
    MOVQ    br_offset, bitReaderShifted_off(br{{ var "id" }})
 {{ end }}
    // Instantiate the decoder once per stream; bufofs is id * bufoff.
    // NOTE(review): "ofs" is set below but never read by the template above.
    {{ set "id" "0" }}
    {{ set "ofs" "0" }}
    {{ set "bufofs" "0" }} {{/* id * bufoff */}}
    {{ template "decode_2_values_x86" . }}
    {{ set "id" "1" }}
    {{ set "ofs" "8" }}
    {{ set "bufofs" "256" }}
    {{ template "decode_2_values_x86" . }}
    {{ set "id" "2" }}
    {{ set "ofs" "16" }}
    {{ set "bufofs" "512" }}
    {{ template "decode_2_values_x86" . }}
    {{ set "id" "3" }}
    {{ set "ofs" "24" }}
    {{ set "bufofs" "768" }}
    {{ template "decode_2_values_x86" . }}
    ADDQ    $4, off     // off += 4
    TESTB   DH, DH      // any br[i].off < 4?
    JNZ     end
    CMPQ    off, $bufoff
    JL      main_loop
 end:
    MOVQ    0(SP), BP
    MOVB    off, ret+56(FP)
    RET
 #undef  off
 #undef  buffer
 #undef  table
 #undef  br_bits_read
 #undef  br_value
 #undef  br_offset
 #undef  peek_bits
 #undef  exhausted
 #undef  br0
 #undef  br1
 #undef  br2
 #undef  br3
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
@ -2,30 +2,40 @@
 // +build amd64,!appengine,!noasm,gc
 // This file contains the specialisation of Decoder.Decompress4X
-// that uses an asm implementation of its main loop.
+// and Decoder.Decompress1X that use an asm implementation of their main loops.
 package huff0
 import (
 	"errors"
 	"fmt"
 	"github.com/klauspost/compress/internal/cpuinfo"
 )
 // decompress4x_main_loop_x86 is an x86 assembler implementation
 // of Decompress4X when tablelog > 8.
-// go:noescape
+//go:noescape
-func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
+func decompress4x_main_loop_amd64(ctx *decompress4xContext)
 	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
 // decompress4x_8b_loop_x86 is an x86 assembler implementation
 // of Decompress4X when tablelog <= 8 which decodes 4 entries
 // per loop.
-// go:noescape
+//go:noescape
-func decompress4x_8b_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
+func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
 	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
 // fallback8BitSize is the size where using Go version is faster.
 const fallback8BitSize = 800
 // decompress4xContext carries the inputs and the result of the assembly
 // main loops used by Decompress4X. The caller fills it in, passes a pointer
 // to the asm routine and reads decoded back afterwards.
 type decompress4xContext struct {
 	// pbr points at the four bit readers, one per stream.
 	pbr      *[4]bitReaderShifted
 	// peekBits is (64 - actualTableLog) & 63, see bitReaderShifted.peekBitsFast().
 	peekBits uint8
 	// out points at the first byte of the output (&out[0]).
 	out      *byte
 	// dstEvery is the caller's dstEvery value; presumably the distance in
 	// bytes between the outputs of consecutive streams — confirm against the asm.
 	dstEvery int
 	// tbl points at the first entry of the single-symbol decoding table.
 	tbl      *dEntrySingle
 	// decoded is read back by the caller after the asm routine returns and
 	// used as the number of bytes decoded so far.
 	decoded  int
 	// limit is &out[dstEvery-4]: decoding always stops when the first
 	// buffer gets here, to avoid writing out of bounds on the last one.
 	limit    *byte
 }
 // Decompress4X will decompress a 4X encoded stream.
 // The length of the supplied input must match the end of a block exactly.
 // The *capacity* of the dst slice must match the destination size of
@ -42,6 +52,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 	if cap(dst) < fallback8BitSize && use8BitTables {
 		return d.decompress4X8bit(dst, src)
 	}
 	var br [4]bitReaderShifted
 	// Decode "jump table"
 	start := 6
@ -71,70 +82,25 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 	const tlMask = tlSize - 1
 	single := d.dt.single[:tlSize]
 	// Use temp table to avoid bound checks/append penalty.
 	buf := d.buffer()
 	var off uint8
 	var decoded int
-	const debug = false
+	if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) {
-
+		ctx := decompress4xContext{
-	// see: bitReaderShifted.peekBitsFast()
+			pbr:      &br,
-	peekBits := uint8((64 - d.actualTableLog) & 63)
+			peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
-
+			out:      &out[0],
-	// Decode 2 values from each decoder/loop.
+			dstEvery: dstEvery,
-	const bufoff = 256
+			tbl:      &single[0],
-	for {
+			limit:    &out[dstEvery-4], // Always stop decoding when first buffer gets here to avoid writing OOB on last.
 		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
 			break
 		}
 		if use8BitTables {
-			off = decompress4x_8b_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
+			decompress4x_8b_main_loop_amd64(&ctx)
 		} else {
-			off = decompress4x_main_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
+			decompress4x_main_loop_amd64(&ctx)
 		}
 		if debug {
 			fmt.Print("DEBUG: ")
 			fmt.Printf("off=%d,", off)
 			for i := 0; i < 4; i++ {
 				fmt.Printf(" br[%d]={bitsRead=%d, value=%x, off=%d}",
 					i, br[i].bitsRead, br[i].value, br[i].off)
 			}
 			fmt.Println("")
 		}
-		if off != 0 {
+		decoded = ctx.decoded
-			break
+		out = out[decoded/4:]
 		}
 		if bufoff > dstEvery {
 			d.bufs.Put(buf)
 			return nil, errors.New("corruption detected: stream overrun 1")
 		}
 		copy(out, buf[0][:])
 		copy(out[dstEvery:], buf[1][:])
 		copy(out[dstEvery*2:], buf[2][:])
 		copy(out[dstEvery*3:], buf[3][:])
 		out = out[bufoff:]
 		decoded += bufoff * 4
 		// There must at least be 3 buffers left.
 		if len(out) < dstEvery*3 {
 			d.bufs.Put(buf)
 			return nil, errors.New("corruption detected: stream overrun 2")
 		}
 	}
 	if off > 0 {
 		ioff := int(off)
 		if len(out) < dstEvery*3+ioff {
 			d.bufs.Put(buf)
 			return nil, errors.New("corruption detected: stream overrun 3")
 		}
 		copy(out, buf[0][:off])
 		copy(out[dstEvery:], buf[1][:off])
 		copy(out[dstEvery*2:], buf[2][:off])
 		copy(out[dstEvery*3:], buf[3][:off])
 		decoded += int(off) * 4
 		out = out[off:]
 	}
 	// Decode remaining.
@ -150,7 +116,6 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 		for bitsLeft > 0 {
 			br.fill()
 			if offset >= endsAt {
 				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 4")
 			}
@ -164,7 +129,6 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 			offset++
 		}
 		if offset != endsAt {
 			d.bufs.Put(buf)
 			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
 		}
 		decoded += offset - dstEvery*i
@ -173,9 +137,86 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 			return nil, err
 		}
 	}
 	d.bufs.Put(buf)
 	if dstSize != decoded {
 		return nil, errors.New("corruption detected: short output block")
 	}
 	return dst, nil
 }
 // decompress1x_main_loop_amd64 is an x86 assembler implementation
 // of the main loop of Decompress1X. It takes its inputs from ctx and
 // leaves its result in ctx.decoded.
 //go:noescape
 func decompress1x_main_loop_amd64(ctx *decompress1xContext)
 // decompress1x_main_loop_bmi2 is an x86 with BMI2 assembler implementation
 // of the main loop of Decompress1X. Decompress1X selects it when
 // cpuinfo.HasBMI2() reports true. It takes its inputs from ctx and
 // leaves its result in ctx.decoded.
 //go:noescape
 func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
 // decompress1xContext carries the inputs and the result of the assembly
 // main loops used by Decompress1X.
 type decompress1xContext struct {
 	// pbr points at the bit reader for the input stream.
 	pbr      *bitReaderShifted
 	// peekBits is (64 - actualTableLog) & 63, see bitReaderShifted.peekBitsFast().
 	peekBits uint8
 	// out points at the first byte of the output buffer.
 	out      *byte
 	// outCap is the capacity of the output buffer, i.e. the maximum decoded size.
 	outCap   int
 	// tbl points at the first entry of the single-symbol decoding table.
 	tbl      *dEntrySingle
 	// decoded is set by the asm routine: the caller uses it as the number of
 	// bytes written to out, or treats error_max_decoded_size_exeeded as an error.
 	decoded  int
 }
 // error_max_decoded_size_exeeded is the error value reported by the asm
 // implementations through decompress1xContext.decoded. Decompress1X
 // translates it into ErrMaxDecodedSizeExceeded.
 const error_max_decoded_size_exeeded = -1
 // Decompress1X will decompress a 1X encoded stream.
 // The cap of the output buffer will be the maximum decompressed size.
 // The length of the supplied input must match the end of a block exactly.
 //
 // The bulk of the stream is decoded by an assembly main loop (the BMI2
 // variant when the CPU supports it); the remaining bits are decoded in Go.
 // An error is returned when no table is loaded, when the bit reader rejects
 // src, or (ErrMaxDecodedSizeExceeded) when the output does not fit in cap(dst).
 func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 	if len(d.dt.single) == 0 {
 		return nil, errors.New("no table loaded")
 	}
 	var br bitReaderShifted
 	err := br.init(src)
 	if err != nil {
 		return dst, err
 	}
 	maxDecodedSize := cap(dst)
 	// Start from an empty result. The asm loop is only used for buffers of
 	// at least 4 bytes; when it is skipped nothing has been decoded yet, so
 	// the tail loop below must see len(dst) == 0 rather than a slice that is
 	// already "full" (which made every cap(dst) < 4 call fail or return
 	// stale bytes).
 	dst = dst[:0]
 	const tlSize = 1 << tableLogMax
 	const tlMask = tlSize - 1
 	if maxDecodedSize >= 4 {
 		// The asm routine writes straight into the backing array of dst.
 		full := dst[:maxDecodedSize]
 		ctx := decompress1xContext{
 			pbr:      &br,
 			out:      &full[0],
 			outCap:   maxDecodedSize,
 			peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
 			tbl:      &d.dt.single[0],
 		}
 		if cpuinfo.HasBMI2() {
 			decompress1x_main_loop_bmi2(&ctx)
 		} else {
 			decompress1x_main_loop_amd64(&ctx)
 		}
 		if ctx.decoded == error_max_decoded_size_exeeded {
 			return nil, ErrMaxDecodedSizeExceeded
 		}
 		dst = full[:ctx.decoded]
 	}
 	// br < 8, so uint8 is fine
 	// NOTE(review): that bound presumably comes from the asm loop; when the
 	// loop is skipped (cap(dst) < 4) br.off is not bounded here — confirm
 	// that br.close() rejects a stream with unread input.
 	bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
 	for bitsLeft > 0 {
 		br.fill()
 		if len(dst) >= maxDecodedSize {
 			br.close()
 			return nil, ErrMaxDecodedSizeExceeded
 		}
 		// The low byte of the entry is the bit length, the high byte the symbol.
 		v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
 		nBits := uint8(v.entry)
 		br.advance(nBits)
 		bitsLeft -= nBits
 		dst = append(dst, uint8(v.entry>>8))
 	}
 	return dst, br.close()
 }
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in
@ -1,195 +0,0 @@
 // +build !appengine
 // +build gc
 // +build !noasm
 #include "textflag.h"
 #include "funcdata.h"
 #include "go_asm.h"
 #ifdef GOAMD64_v4
 #ifndef GOAMD64_v3
 #define GOAMD64_v3
 #endif
 #endif
 #define bufoff      256     // see decompress.go, we're using [4][256]byte table
 // func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
 //	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
 //
 // Template for the Decompress4X main loop. Every pass of main_loop decodes
 // 2 symbols from each of the 4 bit readers and stores them at
 // buf[stream][off:off+2] (buf is a [4][256]byte table, see bufoff). The loop
 // stops when off reaches bufoff or when, after a refill, the offset of any
 // reader is below 4. The low byte of off is stored as the result.
 // When GOAMD64_v3 is defined the BMI2 shifts SHLXQ/SHRXQ are used.
 TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
 // Register assignment. br3 lives in BP, so the caller's BP is saved in the
 // 8-byte frame on entry and restored at "end".
 #define off             R8
 #define buffer          DI
 #define table           SI
 #define br_bits_read    R9
 #define br_value        R10
 #define br_offset       R11
 #define peek_bits       R12
 #define exhausted       DX
 #define br0             R13
 #define br1             R14
 #define br2             R15
 #define br3             BP
    MOVQ    BP, 0(SP)
    XORQ    exhausted, exhausted    // exhausted = false
    XORQ    off, off                // off = 0
    MOVBQZX peekBits+32(FP), peek_bits
    MOVQ    buf+40(FP), buffer
    MOVQ    tbl+48(FP), table
    MOVQ    pbr0+0(FP), br0
    MOVQ    pbr1+8(FP), br1
    MOVQ    pbr2+16(FP), br2
    MOVQ    pbr3+24(FP), br3
 main_loop:
 {{ define "decode_2_values_x86" }}
    // const stream = {{ var "id" }}
    // br{{ var "id"}}.fillFast()
    MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
    MOVQ    bitReaderShifted_value(br{{ var "id" }}), br_value
    MOVQ    bitReaderShifted_off(br{{ var "id" }}), br_offset
    // We must have at least 2 * max tablelog left
    // (refill only when more than 64-22 bits have been consumed)
    CMPQ    br_bits_read, $64-22
    JBE     skip_fill{{ var "id" }}
    SUBQ    $32, br_bits_read       // b.bitsRead -= 32
    SUBQ    $4, br_offset           // b.off -= 4
 	// v := b.in[b.off-4 : b.off]
 	// v = v[:4]
 	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
    MOVQ    bitReaderShifted_in(br{{ var "id" }}), AX
 	// b.value |= uint64(low) << (b.bitsRead & 63)
    // NOTE(review): SHLXQ takes a 64-bit memory operand while the non-v3
    // path loads 32 bits with MOVL — confirm the extra bytes cannot end up
    // in br_value.
 #ifdef GOAMD64_v3
    SHLXQ   br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
 #else
    MOVL    0(br_offset)(AX*1), AX  // AX = uint32(b.in[b.off:b.off+4])
    MOVQ    br_bits_read, CX
    SHLQ    CL, AX
 #endif
    ORQ     AX, br_value
    // exhausted = exhausted || (br{{ var "id"}}.off < 4)
    // DL is scratch for the comparison result; DH accumulates the flag.
    CMPQ    br_offset, $4
    SETLT   DL
    ORB     DL, DH
    // }
 skip_fill{{ var "id" }}:
    // val0 := br{{ var "id"}}.peekTopBits(peekBits)
 #ifdef GOAMD64_v3
    SHRXQ   peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
 #else
    MOVQ    br_value, AX
    MOVQ    peek_bits, CX
    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
 #endif
    // v0 := table[val0&mask]
    MOVW    0(table)(AX*2), AX      // AX - v0
    // br{{ var "id"}}.advance(uint8(v0.entry))
    MOVB    AH, BL                  // BL = uint8(v0.entry >> 8)
 #ifdef GOAMD64_v3
    MOVBQZX AL, CX
    SHLXQ   AX, br_value, br_value // value <<= n
 #else
    MOVBQZX AL, CX
    SHLQ    CL, br_value            // value <<= n
 #endif
    ADDQ    CX, br_bits_read        // bits_read += n
    // second symbol: peek the top bits again (both variants below)
 #ifdef GOAMD64_v3
    SHRXQ    peek_bits, br_value, AX  // AX = (value >> peek_bits) & mask
 #else
    // val1 := br{{ var "id"}}.peekTopBits(peekBits)
    MOVQ    peek_bits, CX
    MOVQ    br_value, AX
    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
 #endif
    // v1 := table[val1&mask]
    MOVW    0(table)(AX*2), AX      // AX - v1
    // br{{ var "id"}}.advance(uint8(v1.entry))
    MOVB    AH, BH                  // BH = uint8(v1.entry >> 8)
 #ifdef GOAMD64_v3
    MOVBQZX AL, CX
    SHLXQ   AX, br_value, br_value // value <<= n
 #else
    MOVBQZX AL, CX
    SHLQ    CL, br_value            // value <<= n
 #endif
    ADDQ    CX, br_bits_read        // bits_read += n
    // these two writes get coalesced
    // buf[stream][off] = uint8(v0.entry >> 8)
    // buf[stream][off+1] = uint8(v1.entry >> 8)
    MOVW    BX, {{ var "bufofs" }}(buffer)(off*1)
    // write the updated state back to the bit reader structure
    MOVB    br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
    MOVQ    br_value, bitReaderShifted_value(br{{ var "id" }})
    MOVQ    br_offset, bitReaderShifted_off(br{{ var "id" }})
 {{ end }}
    // Instantiate the decoder once per stream; bufofs is id * bufoff.
    // NOTE(review): "ofs" is set below but never read by the template above.
    {{ set "id" "0" }}
    {{ set "ofs" "0" }}
    {{ set "bufofs" "0" }} {{/* id * bufoff */}}
    {{ template "decode_2_values_x86" . }}
    {{ set "id" "1" }}
    {{ set "ofs" "8" }}
    {{ set "bufofs" "256" }}
    {{ template "decode_2_values_x86" . }}
    {{ set "id" "2" }}
    {{ set "ofs" "16" }}
    {{ set "bufofs" "512" }}
    {{ template "decode_2_values_x86" . }}
    {{ set "id" "3" }}
    {{ set "ofs" "24" }}
    {{ set "bufofs" "768" }}
    {{ template "decode_2_values_x86" . }}
    ADDQ    $2, off     // off += 2
    TESTB   DH, DH      // any br[i].off < 4?
    JNZ     end
    CMPQ    off, $bufoff
    JL      main_loop
 end:
    MOVQ    0(SP), BP
    MOVB    off, ret+56(FP)
    RET
 #undef  off
 #undef  buffer
 #undef  table
 #undef  br_bits_read
 #undef  br_value
 #undef  br_offset
 #undef  peek_bits
 #undef  exhausted
 #undef  br0
 #undef  br1
 #undef  br2
 #undef  br3
--- a/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
@ -191,3 +191,105 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 	}
 	return dst, nil
 }
 // Decompress1X will decompress a 1X encoded stream.
 // The cap of the output buffer will be the maximum decompressed size.
 // The length of the supplied input must match the end of a block exactly.
 //
 // Streams with a table log of at most 8 are handed to decompress1X8Bit when
 // use8BitTables is set. An error is returned when no table is loaded, when
 // the bit reader rejects src, or (ErrMaxDecodedSizeExceeded) when the output
 // does not fit in cap(dst).
 func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 	if len(d.dt.single) == 0 {
 		return nil, errors.New("no table loaded")
 	}
 	if use8BitTables && d.actualTableLog <= 8 {
 		return d.decompress1X8Bit(dst, src)
 	}
 	var br bitReaderShifted
 	err := br.init(src)
 	if err != nil {
 		return dst, err
 	}
 	maxDecodedSize := cap(dst)
 	dst = dst[:0]
 	// Avoid bounds check by always having full sized table.
 	const tlSize = 1 << tableLogMax
 	const tlMask = tlSize - 1
 	dt := d.dt.single[:tlSize]
 	// Use temp table to avoid bound checks/append penalty.
 	bufs := d.buffer()
 	buf := &bufs[0]
 	// off indexes buf; being a uint8 it wraps to 0 after 256 symbols, which
 	// is the signal to flush buf to dst.
 	var off uint8
 	// Fast path: decode 4 symbols per iteration while at least 8 input
 	// bytes remain unread.
 	for br.off >= 8 {
 		br.fillFast()
 		v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
 		br.advance(uint8(v.entry))
 		buf[off+0] = uint8(v.entry >> 8)
 		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
 		br.advance(uint8(v.entry))
 		buf[off+1] = uint8(v.entry >> 8)
 		// Refill
 		br.fillFast()
 		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
 		br.advance(uint8(v.entry))
 		buf[off+2] = uint8(v.entry >> 8)
 		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
 		br.advance(uint8(v.entry))
 		buf[off+3] = uint8(v.entry >> 8)
 		off += 4
 		if off == 0 {
 			if len(dst)+256 > maxDecodedSize {
 				br.close()
 				d.bufs.Put(bufs)
 				return nil, ErrMaxDecodedSizeExceeded
 			}
 			dst = append(dst, buf[:]...)
 		}
 	}
 	// Flush whatever the fast path left in the temp buffer.
 	if len(dst)+int(off) > maxDecodedSize {
 		d.bufs.Put(bufs)
 		br.close()
 		return nil, ErrMaxDecodedSizeExceeded
 	}
 	dst = append(dst, buf[:off]...)
 	// br < 8, so uint8 is fine
 	bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
 	// Slow path: decode the remaining bits one symbol at a time.
 	for bitsLeft > 0 {
 		br.fill()
 		if len(dst) >= maxDecodedSize {
 			d.bufs.Put(bufs)
 			br.close()
 			return nil, ErrMaxDecodedSizeExceeded
 		}
 		// The low byte of the entry is the bit length, the high byte the symbol.
 		v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
 		nBits := uint8(v.entry)
 		br.advance(nBits)
 		bitsLeft -= nBits
 		dst = append(dst, uint8(v.entry>>8))
 	}
 	d.bufs.Put(bufs)
 	return dst, br.close()
 }
--- a/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo.go
+++ b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo.go
@ -0,0 +1,34 @@
 // Package cpuinfo gives runtime info about the current CPU.
 //
 // This is a very limited module meant for use internally
 // in this project. For more versatile solution check
 // https://github.com/klauspost/cpuid.
 package cpuinfo
 // HasBMI1 reports whether the BMI1 extension was detected on this x86 CPU.
 func HasBMI1() bool {
 	return hasBMI1
 }
 // HasBMI2 reports whether the BMI2 extension was detected on this x86 CPU.
 func HasBMI2() bool {
 	return hasBMI2
 }
 // DisableBMI2 turns the BMI2 flag off, for testing purposes.
 // The returned function puts the flag back to the value it had
 // when DisableBMI2 was called.
 func DisableBMI2() func() {
 	previous := hasBMI2
 	restore := func() {
 		hasBMI2 = previous
 	}
 	hasBMI2 = false
 	return restore
 }
 // HasBMI reports whether both the BMI1 and the BMI2 extensions were detected.
 func HasBMI() bool {
 	return hasBMI1 && hasBMI2
 }
 // Detected CPU features; both remain false unless set elsewhere in the package.
 var (
 	hasBMI1 bool
 	hasBMI2 bool
 )
--- a/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.go
+++ b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.go
@ -0,0 +1,11 @@
 //go:build amd64 && !appengine && !noasm && gc
 // +build amd64,!appengine,!noasm,gc
 package cpuinfo
 // x86extensions reports whether the CPU supports the BMI1 and BMI2
 // extensions. It is implemented in assembly in cpuinfo_amd64.s.
 // NOTE(review): the line below has a space after "//", so it reads as a
 // plain comment rather than a compiler directive — confirm that is intended.
 // go:noescape
 func x86extensions() (bmi1, bmi2 bool)
 // init queries the CPU once and records the result in the package-level
 // hasBMI1 and hasBMI2 flags.
 func init() {
 	hasBMI1, hasBMI2 = x86extensions()
 }
--- a/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.s
+++ b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.s
@ -0,0 +1,36 @@
 // +build !appengine
 // +build gc
 // +build !noasm
 #include "textflag.h"
 #include "funcdata.h"
 #include "go_asm.h"
 // func x86extensions() (bmi1, bmi2 bool)
 //
 // x86extensions uses CPUID to read the BMI1 and BMI2 feature bits.
 // Both results are false when CPUID leaf 7 is not available.
 TEXT ·x86extensions(SB), NOSPLIT, $0
 	// 1. determine max EAX value
 	XORQ AX, AX
 	CPUID
 	CMPQ AX, $7
 	JB   unsupported
 	// 2. EAX = 7, ECX = 0 --- see Table 3-8 "Information Returned by CPUID Instruction"
 	MOVQ $7, AX
 	MOVQ $0, CX
 	CPUID
 	// Test the feature bits in BX; each SETCS captures the carry flag set by
 	// the BTQ before it.
 	BTQ   $3, BX // bit 3 = BMI1
 	SETCS AL
 	BTQ   $8, BX // bit 8 = BMI2
 	SETCS AH
 	MOVB AL, bmi1+0(FP)
 	MOVB AH, bmi2+1(FP)
 	RET
 unsupported:
 	// leaf 7 is not available: report both extensions as missing
 	XORQ AX, AX
 	MOVB AL, bmi1+0(FP)
 	MOVB AL, bmi2+1(FP)
 	RET
--- a/vendor/github.com/klauspost/compress/zstd/README.md
+++ b/vendor/github.com/klauspost/compress/zstd/README.md
@ -386,47 +386,31 @@ In practice this means that concurrency is often limited to utilizing about 3 co
 ### Benchmarks
 These are some examples of performance compared to [datadog cgo library](https://github.com/DataDog/zstd).
 The first two are streaming decodes and the last are smaller inputs. 
- 
+
 Running on AMD Ryzen 9 3950X 16-Core Processor. AMD64 assembly used.
 ```
-BenchmarkDecoderSilesia-8                          3     385000067 ns/op     550.51 MB/s        5498 B/op          8 allocs/op
+BenchmarkDecoderSilesia-32    	                   5	 206878840 ns/op	1024.50 MB/s	   49808 B/op	      43 allocs/op
-BenchmarkDecoderSilesiaCgo-8                       6     197666567 ns/op    1072.25 MB/s      270672 B/op          8 allocs/op
+BenchmarkDecoderEnwik9-32                          1	1271809000 ns/op	 786.28 MB/s	   72048 B/op	      52 allocs/op
-BenchmarkDecoderEnwik9-8                           1    2027001600 ns/op     493.34 MB/s       10496 B/op         18 allocs/op
+Concurrent blocks, performance:
 BenchmarkDecoderEnwik9Cgo-8                        2     979499200 ns/op    1020.93 MB/s      270672 B/op          8 allocs/op
-Concurrent performance:
+BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-32         	   67356	     17857 ns/op	10321.96 MB/s	        22.48 pct	     102 B/op	       0 allocs/op
-
+BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-32     	  266656	      4421 ns/op	26823.21 MB/s	        11.89 pct	      19 B/op	       0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-16                28915         42469 ns/op    4340.07 MB/s         114 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-32      	   20992	     56842 ns/op	8477.17 MB/s	        39.90 pct	     754 B/op	       0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-16           116505          9965 ns/op    11900.16 MB/s         16 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-32        	   27456	     43932 ns/op	9714.01 MB/s	        33.27 pct	     524 B/op	       0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-16              8952        134272 ns/op    3588.70 MB/s         915 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-32      	   78432	     15047 ns/op	8319.15 MB/s	        40.34 pct	      66 B/op	       0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-16               11820        102538 ns/op    4161.90 MB/s         594 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-32       	   65800	     18436 ns/op	8249.63 MB/s	        37.75 pct	      88 B/op	       0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-16             34782         34184 ns/op    3661.88 MB/s          60 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-32          	  102993	     11523 ns/op	35546.09 MB/s	         3.637 pct	     143 B/op	       0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-16              27712         43447 ns/op    3500.58 MB/s          99 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-32    	 1000000	      1070 ns/op	95720.98 MB/s	        80.53 pct	       3 B/op	       0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-16                 62826         18750 ns/op    21845.10 MB/s        104 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-32    	  749802	      1752 ns/op	70272.35 MB/s	       100.0 pct	       5 B/op	       0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-16          631545          1794 ns/op    57078.74 MB/s          2 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-32          	   22640	     52934 ns/op	13263.37 MB/s	        26.25 pct	    1014 B/op	       0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-16         1690140           712 ns/op    172938.13 MB/s         1 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/html.zst-32              	  226412	      5232 ns/op	19572.27 MB/s	        14.49 pct	      20 B/op	       0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-16                 10432        113593 ns/op    6180.73 MB/s        1143 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-32     	  923041	      1276 ns/op	3194.71 MB/s	        31.26 pct	       0 B/op	       0 allocs/op
 BenchmarkDecoder_DecodeAllParallel/html.zst-16                    113206         10671 ns/op    9596.27 MB/s          15 B/op          0 allocs/op
 BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-16          1530615           779 ns/op    5229.49 MB/s           0 B/op          0 allocs/op
 BenchmarkDecoder_DecodeAllParallelCgo/kppkn.gtb.zst-16             65217         16192 ns/op    11383.34 MB/s         46 B/op          0 allocs/op
 BenchmarkDecoder_DecodeAllParallelCgo/geo.protodata.zst-16        292671          4039 ns/op    29363.19 MB/s          6 B/op          0 allocs/op
 BenchmarkDecoder_DecodeAllParallelCgo/plrabn12.txt.zst-16          26314         46021 ns/op    10470.43 MB/s        293 B/op          0 allocs/op
 BenchmarkDecoder_DecodeAllParallelCgo/lcet10.txt.zst-16            33897         34900 ns/op    12227.96 MB/s        205 B/op          0 allocs/op
 BenchmarkDecoder_DecodeAllParallelCgo/asyoulik.txt.zst-16         104348         11433 ns/op    10949.01 MB/s         20 B/op          0 allocs/op
 BenchmarkDecoder_DecodeAllParallelCgo/alice29.txt.zst-16           75949         15510 ns/op    9805.60 MB/s          32 B/op          0 allocs/op
 BenchmarkDecoder_DecodeAllParallelCgo/html_x_4.zst-16             173910          6756 ns/op    60624.29 MB/s         37 B/op          0 allocs/op
 BenchmarkDecoder_DecodeAllParallelCgo/paper-100k.pdf.zst-16       923076          1339 ns/op    76474.87 MB/s          1 B/op          0 allocs/op
 BenchmarkDecoder_DecodeAllParallelCgo/fireworks.jpeg.zst-16       922920          1351 ns/op    91102.57 MB/s          2 B/op          0 allocs/op
 BenchmarkDecoder_DecodeAllParallelCgo/urls.10K.zst-16              27649         43618 ns/op    16096.19 MB/s        407 B/op          0 allocs/op
 BenchmarkDecoder_DecodeAllParallelCgo/html.zst-16                 279073          4160 ns/op    24614.18 MB/s          6 B/op          0 allocs/op
 BenchmarkDecoder_DecodeAllParallelCgo/comp-data.bin.zst-16        749938          1579 ns/op    2581.71 MB/s           0 B/op          0 allocs/op
 ```
-This reflects the performance around May 2020, but this may be out of date.
+This reflects the performance around May 2022, but this may be out of date.
 ## Zstd inside ZIP files
--- a/vendor/github.com/klauspost/compress/zstd/bitreader.go
+++ b/vendor/github.com/klauspost/compress/zstd/bitreader.go
@ -63,13 +63,6 @@ func (b *bitReader) get32BitsFast(n uint8) uint32 {
 	return v
 }
 // get16BitsFast returns the next n bits of the stream as a uint16 and
 // advances bitsRead by n. It does not refill the reader.
 func (b *bitReader) get16BitsFast(n uint8) uint16 {
 	const regMask = 64 - 1
 	// Move the unread bits to the top of the register ...
 	aligned := b.value << (b.bitsRead & regMask)
 	// ... then bring the requested n bits down to the bottom.
 	down := (regMask + 1 - n) & regMask
 	b.bitsRead += n
 	return uint16(aligned >> down)
 }
 // fillFast() will make sure at least 32 bits are available.
 // There must be at least 4 bytes available.
 func (b *bitReader) fillFast() {
--- a/vendor/github.com/klauspost/compress/zstd/bitwriter.go
+++ b/vendor/github.com/klauspost/compress/zstd/bitwriter.go
@ -5,8 +5,6 @@
 package zstd
 import "fmt"
 // bitWriter will write bits.
 // First bit will be LSB of the first byte of output.
 type bitWriter struct {
@ -73,80 +71,6 @@ func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
 	b.nBits += bits
 }
 // flush writes every complete byte held in the bit container to the output,
 // least significant byte first, and keeps the remaining 0-7 bits.
 // There will be at least 56 bits available for writing when this has been called.
 // Using flush32 is faster, but leaves less space for writing.
 // It panics if more than 64 bits are reported as pending.
 func (b *bitWriter) flush() {
 	nBytes := b.nBits >> 3
 	if nBytes > 8 {
 		panic(fmt.Errorf("bits (%d) > 64", b.nBits))
 	}
 	// Lay the container out in little-endian order and append only the
 	// bytes that are completely filled.
 	c := b.bitContainer
 	le := [8]byte{
 		byte(c),
 		byte(c >> 8),
 		byte(c >> 16),
 		byte(c >> 24),
 		byte(c >> 32),
 		byte(c >> 40),
 		byte(c >> 48),
 		byte(c >> 56),
 	}
 	b.out = append(b.out, le[:nBytes]...)
 	b.bitContainer >>= nBytes << 3
 	b.nBits &= 7
 }
 // flush32 will flush out, so there are at least 32 bits available for writing.
 func (b *bitWriter) flush32() {
 	if b.nBits < 32 {
--- a/vendor/github.com/klauspost/compress/zstd/blockdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/blockdec.go
@ -5,9 +5,14 @@
 package zstd
 import (
 	"bytes"
 	"encoding/binary"
 	"errors"
 	"fmt"
 	"io"
 	"io/ioutil"
 	"os"
 	"path/filepath"
 	"sync"
 	"github.com/klauspost/compress/huff0"
@ -38,14 +43,14 @@ const (
 	// maxCompressedBlockSize is the biggest allowed compressed block size (128KB)
 	maxCompressedBlockSize = 128 << 10
 	compressedBlockOverAlloc    = 16
 	maxCompressedBlockSizeAlloc = 128<<10 + compressedBlockOverAlloc
 	// Maximum possible block size (all Raw+Uncompressed).
 	maxBlockSize = (1 << 21) - 1
-	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#literals_section_header
+	maxMatchLen  = 131074
-	maxCompressedLiteralSize = 1 << 18
+	maxSequences = 0x7f00 + 0xffff
 	maxRLELiteralSize        = 1 << 20
 	maxMatchLen              = 131074
 	maxSequences             = 0x7f00 + 0xffff
 	// We support slightly less than the reference decoder to be able to
 	// use ints on 32 bit archs.
@ -97,7 +102,6 @@ type blockDec struct {
 	// Block is RLE, this is the size.
 	RLESize uint32
 	tmp     [4]byte
 	Type blockType
@ -136,7 +140,7 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
 	b.Type = blockType((bh >> 1) & 3)
 	// find size.
 	cSize := int(bh >> 3)
-	maxSize := maxBlockSize
+	maxSize := maxCompressedBlockSizeAlloc
 	switch b.Type {
 	case blockTypeReserved:
 		return ErrReservedBlockType
@ -157,9 +161,9 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
 			println("Data size on stream:", cSize)
 		}
 		b.RLESize = 0
-		maxSize = maxCompressedBlockSize
+		maxSize = maxCompressedBlockSizeAlloc
 		if windowSize < maxCompressedBlockSize && b.lowMem {
-			maxSize = int(windowSize)
+			maxSize = int(windowSize) + compressedBlockOverAlloc
 		}
 		if cSize > maxCompressedBlockSize || uint64(cSize) > b.WindowSize {
 			if debugDecoder {
@ -190,9 +194,9 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
 	// Read block data.
 	if cap(b.dataStorage) < cSize {
 		if b.lowMem || cSize > maxCompressedBlockSize {
-			b.dataStorage = make([]byte, 0, cSize)
+			b.dataStorage = make([]byte, 0, cSize+compressedBlockOverAlloc)
 		} else {
-			b.dataStorage = make([]byte, 0, maxCompressedBlockSize)
+			b.dataStorage = make([]byte, 0, maxCompressedBlockSizeAlloc)
 		}
 	}
 	if cap(b.dst) <= maxSize {
@ -360,14 +364,9 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
 		}
 		if cap(b.literalBuf) < litRegenSize {
 			if b.lowMem {
-				b.literalBuf = make([]byte, litRegenSize)
+				b.literalBuf = make([]byte, litRegenSize, litRegenSize+compressedBlockOverAlloc)
 			} else {
-				if litRegenSize > maxCompressedLiteralSize {
+				b.literalBuf = make([]byte, litRegenSize, maxCompressedBlockSize+compressedBlockOverAlloc)
 					// Exceptional
 					b.literalBuf = make([]byte, litRegenSize)
 				} else {
 					b.literalBuf = make([]byte, litRegenSize, maxCompressedLiteralSize)
 				}
 			}
 		}
 		literals = b.literalBuf[:litRegenSize]
@ -397,14 +396,14 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
 		// Ensure we have space to store it.
 		if cap(b.literalBuf) < litRegenSize {
 			if b.lowMem {
-				b.literalBuf = make([]byte, 0, litRegenSize)
+				b.literalBuf = make([]byte, 0, litRegenSize+compressedBlockOverAlloc)
 			} else {
-				b.literalBuf = make([]byte, 0, maxCompressedLiteralSize)
+				b.literalBuf = make([]byte, 0, maxCompressedBlockSize+compressedBlockOverAlloc)
 			}
 		}
 		var err error
 		// Use our out buffer.
-		huff.MaxDecodedSize = maxCompressedBlockSize
+		huff.MaxDecodedSize = litRegenSize
 		if fourStreams {
 			literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
 		} else {
@ -429,9 +428,9 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
 		// Ensure we have space to store it.
 		if cap(b.literalBuf) < litRegenSize {
 			if b.lowMem {
-				b.literalBuf = make([]byte, 0, litRegenSize)
+				b.literalBuf = make([]byte, 0, litRegenSize+compressedBlockOverAlloc)
 			} else {
-				b.literalBuf = make([]byte, 0, maxCompressedBlockSize)
+				b.literalBuf = make([]byte, 0, maxCompressedBlockSize+compressedBlockOverAlloc)
 			}
 		}
 		huff := hist.huffTree
@ -448,7 +447,7 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
 			return in, err
 		}
 		hist.huffTree = huff
-		huff.MaxDecodedSize = maxCompressedBlockSize
+		huff.MaxDecodedSize = litRegenSize
 		// Use our out buffer.
 		if fourStreams {
 			literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
@ -463,6 +462,8 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
 		if len(literals) != litRegenSize {
 			return in, fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
 		}
 		// Re-cap to get extra size.
 		literals = b.literalBuf[:len(literals)]
 		if debugDecoder {
 			printf("Decompressed %d literals into %d bytes\n", litCompSize, litRegenSize)
 		}
@ -486,10 +487,15 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 		b.dst = append(b.dst, hist.decoders.literals...)
 		return nil
 	}
-	err = hist.decoders.decodeSync(hist)
+	before := len(hist.decoders.out)
 	err = hist.decoders.decodeSync(hist.b[hist.ignoreBuffer:])
 	if err != nil {
 		return err
 	}
 	if hist.decoders.maxSyncLen > 0 {
 		hist.decoders.maxSyncLen += uint64(before)
 		hist.decoders.maxSyncLen -= uint64(len(hist.decoders.out))
 	}
 	b.dst = hist.decoders.out
 	hist.recentOffsets = hist.decoders.prevOffset
 	return nil
@ -632,6 +638,22 @@ func (b *blockDec) prepareSequences(in []byte, hist *history) (err error) {
 		println("initializing sequences:", err)
 		return err
 	}
 	// Extract blocks...
 	if false && hist.dict == nil {
 		fatalErr := func(err error) {
 			if err != nil {
 				panic(err)
 			}
 		}
 		fn := fmt.Sprintf("n-%d-lits-%d-prev-%d-%d-%d-win-%d.blk", hist.decoders.nSeqs, len(hist.decoders.literals), hist.recentOffsets[0], hist.recentOffsets[1], hist.recentOffsets[2], hist.windowSize)
 		var buf bytes.Buffer
 		fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.litLengths.fse))
 		fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.matchLengths.fse))
 		fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.offsets.fse))
 		buf.Write(in)
 		ioutil.WriteFile(filepath.Join("testdata", "seqs", fn), buf.Bytes(), os.ModePerm)
 	}
 	return nil
 }
@ -650,6 +672,7 @@ func (b *blockDec) decodeSequences(hist *history) error {
 	}
 	hist.decoders.windowSize = hist.windowSize
 	hist.decoders.prevOffset = hist.recentOffsets
 	err := hist.decoders.decode(b.sequence)
 	hist.recentOffsets = hist.decoders.prevOffset
 	return err
--- a/vendor/github.com/klauspost/compress/zstd/bytebuf.go
+++ b/vendor/github.com/klauspost/compress/zstd/bytebuf.go
@ -23,7 +23,7 @@ type byteBuffer interface {
 	readByte() (byte, error)
 	// Skip n bytes.
-	skipN(n int) error
+	skipN(n int64) error
 }
 // in-memory buffer
@ -52,10 +52,6 @@ func (b *byteBuf) readBig(n int, dst []byte) ([]byte, error) {
 	return r, nil
 }
 // remain returns the bytes still held by the buffer.
 // The result is the buffer's own slice, not a copy.
 func (b *byteBuf) remain() []byte {
 	return *b
 }
 func (b *byteBuf) readByte() (byte, error) {
 	bb := *b
 	if len(bb) < 1 {
@ -66,9 +62,12 @@ func (b *byteBuf) readByte() (byte, error) {
 	return r, nil
 }
-func (b *byteBuf) skipN(n int) error {
+func (b *byteBuf) skipN(n int64) error {
 	bb := *b
-	if len(bb) < n {
+	if n < 0 {
 		return fmt.Errorf("negative skip (%d) requested", n)
 	}
 	if int64(len(bb)) < n {
 		return io.ErrUnexpectedEOF
 	}
 	*b = bb[n:]
@ -124,9 +123,9 @@ func (r *readerWrapper) readByte() (byte, error) {
 	return r.tmp[0], nil
 }
-func (r *readerWrapper) skipN(n int) error {
+func (r *readerWrapper) skipN(n int64) error {
-	n2, err := io.CopyN(ioutil.Discard, r.r, int64(n))
+	n2, err := io.CopyN(ioutil.Discard, r.r, n)
-	if n2 != int64(n) {
+	if n2 != n {
 		err = io.ErrUnexpectedEOF
 	}
 	return err
--- a/vendor/github.com/klauspost/compress/zstd/bytereader.go
+++ b/vendor/github.com/klauspost/compress/zstd/bytereader.go
@ -13,12 +13,6 @@ type byteReader struct {
 	off int
 }
 // init points the reader at in and rewinds the read offset to the beginning.
 func (b *byteReader) init(in []byte) {
 	b.b, b.off = in, 0
 }
 // advance the stream by n bytes.
 func (b *byteReader) advance(n uint) {
 	b.off += int(n)
--- a/vendor/github.com/klauspost/compress/zstd/decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder.go
@ -347,18 +347,23 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
 			}
 			frame.history.setDict(&dict)
 		}
-
+		if frame.WindowSize > d.o.maxWindowSize {
-		if frame.FrameContentSize != fcsUnknown && frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
+			if debugDecoder {
-			return dst, ErrDecoderSizeExceeded
+				println("window size exceeded:", frame.WindowSize, ">", d.o.maxWindowSize)
 			}
 			return dst, ErrWindowSizeExceeded
 		}
-		if frame.FrameContentSize < 1<<30 {
+		if frame.FrameContentSize != fcsUnknown {
-			// Never preallocate more than 1 GB up front.
+			if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
 				return dst, ErrDecoderSizeExceeded
 			}
 			if cap(dst)-len(dst) < int(frame.FrameContentSize) {
-				dst2 := make([]byte, len(dst), len(dst)+int(frame.FrameContentSize))
+				dst2 := make([]byte, len(dst), len(dst)+int(frame.FrameContentSize)+compressedBlockOverAlloc)
 				copy(dst2, dst)
 				dst = dst2
 			}
 		}
 		if cap(dst) == 0 {
 			// Allocate len(input) * 2 by default if nothing is provided
 			// and we didn't get frame content size.
@ -437,7 +442,7 @@ func (d *Decoder) nextBlock(blocking bool) (ok bool) {
 		println("got", len(d.current.b), "bytes, error:", d.current.err, "data crc:", tmp)
 	}
-	if len(next.b) > 0 {
+	if !d.o.ignoreChecksum && len(next.b) > 0 {
 		n, err := d.current.crc.Write(next.b)
 		if err == nil {
 			if n != len(next.b) {
@ -449,7 +454,7 @@ func (d *Decoder) nextBlock(blocking bool) (ok bool) {
 		got := d.current.crc.Sum64()
 		var tmp [4]byte
 		binary.LittleEndian.PutUint32(tmp[:], uint32(got))
-		if !bytes.Equal(tmp[:], next.d.checkCRC) && !ignoreCRC {
+		if !d.o.ignoreChecksum && !bytes.Equal(tmp[:], next.d.checkCRC) {
 			if debugDecoder {
 				println("CRC Check Failed:", tmp[:], " (got) !=", next.d.checkCRC, "(on stream)")
 			}
@ -533,9 +538,15 @@ func (d *Decoder) nextBlockSync() (ok bool) {
 		// Update/Check CRC
 		if d.frame.HasCheckSum {
-			d.frame.crc.Write(d.current.b)
+			if !d.o.ignoreChecksum {
 				d.frame.crc.Write(d.current.b)
 			}
 			if d.current.d.Last {
-				d.current.err = d.frame.checkCRC()
+				if !d.o.ignoreChecksum {
 					d.current.err = d.frame.checkCRC()
 				} else {
 					d.current.err = d.frame.consumeCRC()
 				}
 				if d.current.err != nil {
 					println("CRC error:", d.current.err)
 					return false
@ -629,60 +640,18 @@ func (d *Decoder) startSyncDecoder(r io.Reader) error {
 // Create Decoder:
 // ASYNC:
-// Spawn 4 go routines.
+// Spawn 3 go routines.
-// 0: Read frames and decode blocks.
+// 0: Read frames and decode block literals.
-// 1: Decode block and literals. Receives hufftree and seqdecs, returns seqdecs and huff tree.
+// 1: Decode sequences.
-// 2: Wait for recentOffsets if needed. Decode sequences, send recentOffsets.
+// 2: Execute sequences, send to output.
 // 3: Wait for stream history, execute sequences, send stream history.
 func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output chan decodeOutput) {
 	defer d.streamWg.Done()
 	br := readerWrapper{r: r}
 	var seqPrepare = make(chan *blockDec, d.o.concurrent)
 	var seqDecode = make(chan *blockDec, d.o.concurrent)
 	var seqExecute = make(chan *blockDec, d.o.concurrent)
-	// Async 1: Prepare blocks...
+	// Async 1: Decode sequences...
 	go func() {
 		var hist history
 		var hasErr bool
 		for block := range seqPrepare {
 			if hasErr {
 				if block != nil {
 					seqDecode <- block
 				}
 				continue
 			}
 			if block.async.newHist != nil {
 				if debugDecoder {
 					println("Async 1: new history")
 				}
 				hist.reset()
 				if block.async.newHist.dict != nil {
 					hist.setDict(block.async.newHist.dict)
 				}
 			}
 			if block.err != nil || block.Type != blockTypeCompressed {
 				hasErr = block.err != nil
 				seqDecode <- block
 				continue
 			}
 			remain, err := block.decodeLiterals(block.data, &hist)
 			block.err = err
 			hasErr = block.err != nil
 			if err == nil {
 				block.async.literals = hist.decoders.literals
 				block.async.seqData = remain
 			} else if debugDecoder {
 				println("decodeLiterals error:", err)
 			}
 			seqDecode <- block
 		}
 		close(seqDecode)
 	}()
 	// Async 2: Decode sequences...
 	go func() {
 		var hist history
 		var hasErr bool
@ -696,7 +665,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
 			}
 			if block.async.newHist != nil {
 				if debugDecoder {
-					println("Async 2: new history, recent:", block.async.newHist.recentOffsets)
+					println("Async 1: new history, recent:", block.async.newHist.recentOffsets)
 				}
 				hist.decoders = block.async.newHist.decoders
 				hist.recentOffsets = block.async.newHist.recentOffsets
@ -750,7 +719,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
 			}
 			if block.async.newHist != nil {
 				if debugDecoder {
-					println("Async 3: new history")
+					println("Async 2: new history")
 				}
 				hist.windowSize = block.async.newHist.windowSize
 				hist.allocFrameBuffer = block.async.newHist.allocFrameBuffer
@ -837,6 +806,33 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
 decodeStream:
 	for {
 		var hist history
 		var hasErr bool
 		decodeBlock := func(block *blockDec) {
 			if hasErr {
 				if block != nil {
 					seqDecode <- block
 				}
 				return
 			}
 			if block.err != nil || block.Type != blockTypeCompressed {
 				hasErr = block.err != nil
 				seqDecode <- block
 				return
 			}
 			remain, err := block.decodeLiterals(block.data, &hist)
 			block.err = err
 			hasErr = block.err != nil
 			if err == nil {
 				block.async.literals = hist.decoders.literals
 				block.async.seqData = remain
 			} else if debugDecoder {
 				println("decodeLiterals error:", err)
 			}
 			seqDecode <- block
 		}
 		frame := d.frame
 		if debugDecoder {
 			println("New frame...")
@ -863,7 +859,7 @@ decodeStream:
 			case <-ctx.Done():
 			case dec := <-d.decoders:
 				dec.sendErr(err)
-				seqPrepare <- dec
+				decodeBlock(dec)
 			}
 			break decodeStream
 		}
@ -883,6 +879,10 @@ decodeStream:
 				if debugDecoder {
 					println("Alloc History:", h.allocFrameBuffer)
 				}
 				hist.reset()
 				if h.dict != nil {
 					hist.setDict(h.dict)
 				}
 				dec.async.newHist = &h
 				dec.async.fcs = frame.FrameContentSize
 				historySent = true
@ -909,7 +909,7 @@ decodeStream:
 			}
 			err = dec.err
 			last := dec.Last
-			seqPrepare <- dec
+			decodeBlock(dec)
 			if err != nil {
 				break decodeStream
 			}
@ -918,7 +918,7 @@ decodeStream:
 			}
 		}
 	}
-	close(seqPrepare)
+	close(seqDecode)
 	wg.Wait()
 	d.frame.history.b = frameHistCache
 }
--- a/vendor/github.com/klauspost/compress/zstd/decoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
@ -19,6 +19,7 @@ type decoderOptions struct {
 	maxDecodedSize uint64
 	maxWindowSize  uint64
 	dicts          []dict
 	ignoreChecksum bool
 }
 func (o *decoderOptions) setDefault() {
@ -31,7 +32,7 @@ func (o *decoderOptions) setDefault() {
 	if o.concurrent > 4 {
 		o.concurrent = 4
 	}
-	o.maxDecodedSize = 1 << 63
+	o.maxDecodedSize = 64 << 30
 }
 // WithDecoderLowmem will set whether to use a lower amount of memory,
@ -66,7 +67,7 @@ func WithDecoderConcurrency(n int) DOption {
 // WithDecoderMaxMemory allows to set a maximum decoded size for in-memory
 // non-streaming operations or maximum window size for streaming operations.
 // This can be used to control memory usage of potentially hostile content.
-// Maximum and default is 1 << 63 bytes.
+// Maximum is 1 << 63 bytes. Default is 64GiB.
 func WithDecoderMaxMemory(n uint64) DOption {
 	return func(o *decoderOptions) error {
 		if n == 0 {
@ -112,3 +113,11 @@ func WithDecoderMaxWindow(size uint64) DOption {
 		return nil
 	}
 }
 // IgnoreChecksum returns a decoder option that sets whether checksum
 // checking is skipped. Passing true forcibly ignores checksums.
 func IgnoreChecksum(b bool) DOption {
 	apply := func(opts *decoderOptions) error {
 		opts.ignoreChecksum = b
 		return nil
 	}
 	return apply
 }
--- a/vendor/github.com/klauspost/compress/zstd/enc_better.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_better.go
@ -156,8 +156,8 @@ encodeLoop:
 				panic("offset0 was 0")
 			}
 			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
 			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
 			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
 			candidateL := e.longTable[nextHashL]
 			candidateS := e.table[nextHashS]
@ -518,8 +518,8 @@ encodeLoop:
 			}
 			// Store this, since we have it.
 			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
 			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
 			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
 			// We have at least 4 byte match.
 			// No need to check backwards. We come straight from a match
@ -674,8 +674,8 @@ encodeLoop:
 				panic("offset0 was 0")
 			}
 			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
 			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
 			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
 			candidateL := e.longTable[nextHashL]
 			candidateS := e.table[nextHashS]
@ -1047,8 +1047,8 @@ encodeLoop:
 			}
 			// Store this, since we have it.
 			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
 			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
 			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
 			// We have at least 4 byte match.
 			// No need to check backwards. We come straight from a match
--- a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
@ -127,8 +127,8 @@ encodeLoop:
 				panic("offset0 was 0")
 			}
 			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
 			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
 			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
 			candidateL := e.longTable[nextHashL]
 			candidateS := e.table[nextHashS]
@ -439,8 +439,8 @@ encodeLoop:
 		var t int32
 		for {
 			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
 			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
 			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
 			candidateL := e.longTable[nextHashL]
 			candidateS := e.table[nextHashS]
@ -785,8 +785,8 @@ encodeLoop:
 				panic("offset0 was 0")
 			}
 			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
 			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
 			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
 			candidateL := e.longTable[nextHashL]
 			candidateS := e.table[nextHashS]
@ -969,7 +969,7 @@ encodeLoop:
 		te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)}
 		te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)}
 		longHash1 := hashLen(cv0, dFastLongTableBits, dFastLongLen)
-		longHash2 := hashLen(cv0, dFastLongTableBits, dFastLongLen)
+		longHash2 := hashLen(cv1, dFastLongTableBits, dFastLongLen)
 		e.longTable[longHash1] = te0
 		e.longTable[longHash2] = te1
 		e.markLongShardDirty(longHash1)
@ -1002,8 +1002,8 @@ encodeLoop:
 			}
 			// Store this, since we have it.
 			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
 			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
 			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
 			// We have at least 4 byte match.
 			// No need to check backwards. We come straight from a match
--- a/vendor/github.com/klauspost/compress/zstd/encoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder.go
@ -528,8 +528,8 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
 		// If a non-single block is needed the encoder will reset again.
 		e.encoders <- enc
 	}()
-	// Use single segments when above minimum window and below 1MB.
+	// Use single segments when above minimum window and below window size.
-	single := len(src) < 1<<20 && len(src) > MinWindowSize
+	single := len(src) <= e.o.windowSize && len(src) > MinWindowSize
 	if e.o.single != nil {
 		single = *e.o.single
 	}
@ -551,7 +551,7 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
 	}
 	// If we can do everything in one block, prefer that.
-	if len(src) <= maxCompressedBlockSize {
+	if len(src) <= e.o.blockSize {
 		enc.Reset(e.o.dict, true)
 		// Slightly faster with no history and everything in one block.
 		if e.o.crc {
--- a/vendor/github.com/klauspost/compress/zstd/encoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
@ -283,7 +283,7 @@ func WithNoEntropyCompression(b bool) EOption {
 // a decoder is allowed to reject a compressed frame which requests a memory size beyond decoder's authorized range.
 // For broader compatibility, decoders are recommended to support memory sizes of at least 8 MB.
 // This is only a recommendation, each decoder is free to support higher or lower limits, depending on local limitations.
-// If this is not specified, block encodes will automatically choose this based on the input size.
+// If this is not specified, block encodes will automatically choose this based on the input size and the window size.
 // This setting has no effect on streamed encodes.
 func WithSingleSegment(b bool) EOption {
 	return func(o *encoderOptions) error {
--- a/vendor/github.com/klauspost/compress/zstd/framedec.go
+++ b/vendor/github.com/klauspost/compress/zstd/framedec.go
@ -106,7 +106,7 @@ func (d *frameDec) reset(br byteBuffer) error {
 		}
 		n := uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
 		println("Skipping frame with", n, "bytes.")
-		err = br.skipN(int(n))
+		err = br.skipN(int64(n))
 		if err != nil {
 			if debugDecoder {
 				println("Reading discarded frame", err)
@ -231,20 +231,27 @@ func (d *frameDec) reset(br byteBuffer) error {
 		d.crc.Reset()
 	}
 	if d.WindowSize > d.o.maxWindowSize {
 		if debugDecoder {
 			printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
 		}
 		return ErrWindowSizeExceeded
 	}
 	if d.WindowSize == 0 && d.SingleSegment {
 		// We may not need window in this case.
 		d.WindowSize = d.FrameContentSize
 		if d.WindowSize < MinWindowSize {
 			d.WindowSize = MinWindowSize
 		}
 		if d.WindowSize > d.o.maxDecodedSize {
 			if debugDecoder {
 				printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
 			}
 			return ErrDecoderSizeExceeded
 		}
 	}
 	if d.WindowSize > uint64(d.o.maxWindowSize) {
 		if debugDecoder {
 			printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
 		}
 		return ErrWindowSizeExceeded
 	}
 	// The minimum Window_Size is 1 KB.
 	if d.WindowSize < MinWindowSize {
 		if debugDecoder {
@ -253,10 +260,11 @@ func (d *frameDec) reset(br byteBuffer) error {
 		return ErrWindowSizeTooSmall
 	}
 	d.history.windowSize = int(d.WindowSize)
-	if d.o.lowMem && d.history.windowSize < maxBlockSize {
+	if !d.o.lowMem || d.history.windowSize < maxBlockSize {
 		// Alloc 2x window size if not low-mem, or very small window size.
 		d.history.allocFrameBuffer = d.history.windowSize * 2
 		// TODO: Maybe use FrameContent size
 	} else {
 		// Alloc with one additional block
 		d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize
 	}
@ -290,13 +298,6 @@ func (d *frameDec) checkCRC() error {
 	if !d.HasCheckSum {
 		return nil
 	}
 	var tmp [4]byte
 	got := d.crc.Sum64()
 	// Flip to match file order.
 	tmp[0] = byte(got >> 0)
 	tmp[1] = byte(got >> 8)
 	tmp[2] = byte(got >> 16)
 	tmp[3] = byte(got >> 24)
 	// We can overwrite upper tmp now
 	want, err := d.rawInput.readSmall(4)
@ -305,7 +306,19 @@ func (d *frameDec) checkCRC() error {
 		return err
 	}
-	if !bytes.Equal(tmp[:], want) && !ignoreCRC {
+	if d.o.ignoreChecksum {
 		return nil
 	}
 	var tmp [4]byte
 	got := d.crc.Sum64()
 	// Flip to match file order.
 	tmp[0] = byte(got >> 0)
 	tmp[1] = byte(got >> 8)
 	tmp[2] = byte(got >> 16)
 	tmp[3] = byte(got >> 24)
 	if !bytes.Equal(tmp[:], want) {
 		if debugDecoder {
 			println("CRC Check Failed:", tmp[:], "!=", want)
 		}
@ -317,6 +330,19 @@ func (d *frameDec) checkCRC() error {
 	return nil
 }
 // consumeCRC skips over the stored checksum without verifying it.
 // When the frame carries a checksum, four bytes are read from the raw input
 // and discarded; a read failure is returned to the caller.
 // Frames without a checksum are left untouched.
 func (d *frameDec) consumeCRC() error {
 	if !d.HasCheckSum {
 		return nil
 	}
 	if _, err := d.rawInput.readSmall(4); err != nil {
 		println("CRC missing?", err)
 		return err
 	}
 	return nil
 }
 // runDecoder will create a sync decoder that will decode a block of data.
 func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
 	saved := d.history.b
@ -326,6 +352,19 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
 	d.history.ignoreBuffer = len(dst)
 	// Store input length, so we only check new data.
 	crcStart := len(dst)
 	d.history.decoders.maxSyncLen = 0
 	if d.FrameContentSize != fcsUnknown {
 		d.history.decoders.maxSyncLen = d.FrameContentSize + uint64(len(dst))
 		if d.history.decoders.maxSyncLen > d.o.maxDecodedSize {
 			return dst, ErrDecoderSizeExceeded
 		}
 		if uint64(cap(dst)) < d.history.decoders.maxSyncLen {
 			// Alloc for output
 			dst2 := make([]byte, len(dst), d.history.decoders.maxSyncLen+compressedBlockOverAlloc)
 			copy(dst2, dst)
 			dst = dst2
 		}
 	}
 	var err error
 	for {
 		err = dec.reset(d.rawInput, d.WindowSize)
@ -360,13 +399,17 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
 		if d.FrameContentSize != fcsUnknown && uint64(len(d.history.b)-crcStart) != d.FrameContentSize {
 			err = ErrFrameSizeMismatch
 		} else if d.HasCheckSum {
-			var n int
+			if d.o.ignoreChecksum {
-			n, err = d.crc.Write(dst[crcStart:])
+				err = d.consumeCRC()
-			if err == nil {
+			} else {
-				if n != len(dst)-crcStart {
+				var n int
-					err = io.ErrShortWrite
+				n, err = d.crc.Write(dst[crcStart:])
-				} else {
+				if err == nil {
-					err = d.checkCRC()
+					if n != len(dst)-crcStart {
 						err = io.ErrShortWrite
 					} else {
 						err = d.checkCRC()
 					}
 				}
 			}
 		}
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
@ -5,8 +5,10 @@
 package zstd
 import (
 	"encoding/binary"
 	"errors"
 	"fmt"
 	"io"
 )
 const (
@ -178,10 +180,32 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
 		return fmt.Errorf("corruption detected (total %d != %d)", gotTotal, 1<<s.actualTableLog)
 	}
 	b.advance((bitCount + 7) >> 3)
 	// println(s.norm[:s.symbolLen], s.symbolLen)
 	return s.buildDtable()
 }
 // mustReadFrom fills the decoder's fields from r. Each field is read back to
 // back with encoding/binary in little-endian order.
 // It panics with the read error if any field cannot be read.
 func (s *fseDecoder) mustReadFrom(r io.Reader) {
 	// Fields in the order they are read:
 	//	dt             - decompression table
 	//	symbolLen      - length of the active part of the symbol table
 	//	actualTableLog - selected tablelog
 	//	maxBits        - maximum number of additional bits
 	//	stateTable     - used for table creation to avoid allocations
 	//	norm
 	//	preDefined
 	fields := []interface{}{
 		&s.dt,
 		&s.symbolLen,
 		&s.actualTableLog,
 		&s.maxBits,
 		&s.stateTable,
 		&s.norm,
 		&s.preDefined,
 	}
 	for _, field := range fields {
 		if err := binary.Read(r, binary.LittleEndian, field); err != nil {
 			panic(err)
 		}
 	}
 }
 // decSymbol contains information about a state entry,
 // Including the state offset base, the output symbol and
 // the number of bits to read for the low part of the destination state.
@ -204,18 +228,10 @@ func (d decSymbol) newState() uint16 {
 	return uint16(d >> 16)
 }
 // baseline returns the baseline value kept in the upper 32 bits of d.
 func (d decSymbol) baseline() uint32 {
 	upper := d >> 32
 	return uint32(upper)
 }
 // baselineInt returns the value in the upper 32 bits of d, converted to int.
 func (d decSymbol) baselineInt() int {
 	upper := d >> 32
 	return int(upper)
 }
 // set overwrites d with the four given fields packed into one value:
 // nbits in bits 0-7, addBits in bits 8-15, newState in bits 16-31
 // and baseline in bits 32-63.
 func (d *decSymbol) set(nbits, addBits uint8, newState uint16, baseline uint32) {
 	packed := decSymbol(baseline) << 32
 	packed |= decSymbol(newState) << 16
 	packed |= decSymbol(addBits) << 8
 	packed |= decSymbol(nbits)
 	*d = packed
 }
 func (d *decSymbol) setNBits(nBits uint8) {
 	const mask = 0xffffffffffffff00
 	*d = (*d & mask) | decSymbol(nBits)
@ -231,11 +247,6 @@ func (d *decSymbol) setNewState(state uint16) {
 	*d = (*d & mask) | decSymbol(state)<<16
 }
 // setBaseline replaces the upper 32 bits of d with baseline and leaves the
 // lower 32 bits unchanged.
 func (d *decSymbol) setBaseline(baseline uint32) {
 	const lowHalf = 0xffffffff
 	kept := *d & lowHalf
 	*d = kept | decSymbol(baseline)<<32
 }
 func (d *decSymbol) setExt(addBits uint8, baseline uint32) {
 	const mask = 0xffff00ff
 	*d = (*d & mask) | (decSymbol(addBits) << 8) | (decSymbol(baseline) << 32)
@ -257,68 +268,6 @@ func (s *fseDecoder) setRLE(symbol decSymbol) {
 	s.dt[0] = symbol
 }
 // buildDtable will build the decoding table.
 // It reads the normalized counts in s.norm[:s.symbolLen] and the table log in
 // s.actualTableLog, and fills the symbol, bit count and next-state fields of
 // s.dt. s.stateTable is used as scratch space while building.
 // An error is returned when the counts do not spread over the table so that
 // the position returns to 0, or when a computed state fails the sanity checks
 // below.
 func (s *fseDecoder) buildDtable() error {
 	tableSize := uint32(1 << s.actualTableLog)
 	highThreshold := tableSize - 1
 	// symbolNext[i] holds the next state value to hand out for symbol i.
 	symbolNext := s.stateTable[:256]
 	// Init, lay down lowprob symbols
 	{
 		for i, v := range s.norm[:s.symbolLen] {
 			if v == -1 {
 				// A count of -1 marks a low-probability symbol: it gets one cell
 				// at the current top of the table, and the top moves down.
 				// The symbol is stashed in the addBits field for now.
 				s.dt[highThreshold].setAddBits(uint8(i))
 				highThreshold--
 				symbolNext[i] = 1
 			} else {
 				symbolNext[i] = uint16(v)
 			}
 		}
 	}
 	// Spread symbols
 	{
 		tableMask := tableSize - 1
 		// NOTE(review): tableStep is defined elsewhere; presumably it yields a
 		// stride that visits every cell once - confirm.
 		step := tableStep(tableSize)
 		position := uint32(0)
 		for ss, v := range s.norm[:s.symbolLen] {
 			// Symbols with a count of -1 were placed above; int(v) is negative
 			// for them, so this loop does not run.
 			for i := 0; i < int(v); i++ {
 				s.dt[position].setAddBits(uint8(ss))
 				position = (position + step) & tableMask
 				for position > highThreshold {
 					// lowprob area
 					position = (position + step) & tableMask
 				}
 			}
 		}
 		if position != 0 {
 			// position must reach all cells once, otherwise normalizedCounter is incorrect
 			return errors.New("corrupted input (position != 0)")
 		}
 	}
 	// Build Decoding table
 	{
 		// Shadows the outer tableSize with a uint16 so it can be compared
 		// against the uint16 state values.
 		tableSize := uint16(1 << s.actualTableLog)
 		for u, v := range s.dt[:tableSize] {
 			// The symbol was stashed in addBits by the two passes above.
 			symbol := v.addBits()
 			nextState := symbolNext[symbol]
 			symbolNext[symbol] = nextState + 1
 			// NOTE(review): highBits is defined elsewhere; presumably the index
 			// of the highest set bit - confirm.
 			nBits := s.actualTableLog - byte(highBits(uint32(nextState)))
 			s.dt[u&maxTableMask].setNBits(nBits)
 			newState := (nextState << nBits) - tableSize
 			if newState > tableSize {
 				return fmt.Errorf("newState (%d) outside table size (%d)", newState, tableSize)
 			}
 			if newState == uint16(u) && nBits == 0 {
 				// Seems weird that this is possible with nbits > 0.
 				return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, u)
 			}
 			s.dt[u&maxTableMask].setNewState(newState)
 		}
 	}
 	return nil
 }
 // transform will transform the decoder table into a table usable for
 // decoding without having to apply the transformation while decoding.
 // The state will contain the base value and the number of bits to read.
@ -352,34 +301,7 @@ func (s *fseState) init(br *bitReader, tableLog uint8, dt []decSymbol) {
 	s.state = dt[br.getBits(tableLog)]
 }
 // next advances the decoder to its following state.
 // It reads as many bits from br as the current state asks for and adds them
 // to the current state's new-state value to index the decoding table.
 // At least tablelog bits must be available in the bit reader.
 func (s *fseState) next(br *bitReader) {
 	cur := s.state
 	extra := uint16(br.getBits(cur.nbBits()))
 	s.state = s.dt[cur.newState()+extra]
 }
 // finished reports whether the bitstream is exhausted while the current
 // state would still need to read bits from the input to advance.
 func (s *fseState) finished(br *bitReader) bool {
 	if !br.finished() {
 		return false
 	}
 	return s.state.nbBits() > 0
 }
 // final returns the baseline and additional-bit count of the current state.
 // The state is left untouched and nothing is read from the stream.
 func (s *fseState) final() (int, uint8) {
 	cur := s.state
 	return cur.baselineInt(), cur.addBits()
 }
 // final returns the baseline and additional-bit count stored in the symbol.
 // Nothing is decoded and nothing is read from the stream.
 func (s decSymbol) final() (int, uint8) {
 	base := s.baselineInt()
 	extra := s.addBits()
 	return base, extra
 }
 // nextFast moves to the next state and returns its baseline and
 // additional-bit count.
 // This can only be used if no symbols are 0 bits.
 // At least tablelog bits must be available in the bit reader.
 func (s *fseState) nextFast(br *bitReader) (uint32, uint8) {
 	cur := s.state
 	extra := br.get16BitsFast(cur.nbBits())
 	following := s.dt[cur.newState()+extra]
 	s.state = following
 	return following.baseline(), following.addBits()
 }
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go
@ -0,0 +1,64 @@
 //go:build amd64 && !appengine && !noasm && gc
 // +build amd64,!appengine,!noasm,gc
 package zstd
 import (
 	"fmt"
 )
 // buildDtableAsmContext carries the arguments and error details exchanged
 // between fseDecoder.buildDtable and buildDtable_asm.
 // The assembly addresses these fields by fixed offsets, so the field order
 // and sizes must not change without regenerating the assembly.
 type buildDtableAsmContext struct {
 	// inputs
 	// stateTable points to the first element of the decoder's stateTable.
 	stateTable *uint16
 	// norm points to the first element of the decoder's norm table.
 	norm       *int16
 	// dt points to the first element of the decoder's dt table.
 	dt         *uint64
 	// outputs --- set by the procedure in the case of error;
 	// for interpretation please see the error handling part below
 	errParam1 uint64
 	errParam2 uint64
 }
 // buildDtable_asm is an x86 assembly implementation of fseDecoder.buildDtable.
 // It returns 0 on success and a non-zero error code otherwise; on error the
 // details are stored in ctx.errParam1 and ctx.errParam2.
 //
 // The directive below must be written without a space after the slashes,
 // otherwise the compiler treats it as an ordinary comment and ignores it.
 //
 //go:noescape
 func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int
 // Exit codes returned by buildDtable_asm.
 // please keep in sync with _generate/gen_fse.go
 const (
 	// spreading the symbols did not end at position 0
 	errorCorruptedNormalizedCounter = 1
 	// a computed new state is outside the table size
 	errorNewStateTooBig             = 2
 	// a state maps to itself while reading no bits
 	errorNewStateNoBits             = 3
 )
 // buildDtable will build the decoding table.
 // The work is done by buildDtable_asm; this wrapper hands it pointers to the
 // decoder's tables and turns a non-zero exit code into a Go error.
 func (s *fseDecoder) buildDtable() error {
 	asmCtx := buildDtableAsmContext{
 		stateTable: &s.stateTable[0],
 		norm:       &s.norm[0],
 		dt:         (*uint64)(&s.dt[0]),
 	}
 	switch rc := buildDtable_asm(s, &asmCtx); rc {
 	case 0:
 		return nil
 	case errorCorruptedNormalizedCounter:
 		// errParam1 holds the final spread position.
 		return fmt.Errorf("corrupted input (position=%d, expected 0)", asmCtx.errParam1)
 	case errorNewStateTooBig:
 		// errParam1 holds the new state, errParam2 the table size.
 		return fmt.Errorf("newState (%d) outside table size (%d)", decSymbol(asmCtx.errParam1), asmCtx.errParam2)
 	case errorNewStateNoBits:
 		// errParam1 holds the new state, errParam2 the old state.
 		return fmt.Errorf("newState (%d) == oldState (%d) and no bits", decSymbol(asmCtx.errParam1), decSymbol(asmCtx.errParam2))
 	default:
 		return fmt.Errorf("buildDtable_asm returned unhandled nonzero code = %d", rc)
 	}
 }
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s
@ -0,0 +1,127 @@
 // Code generated by command: go run gen_fse.go -out ../fse_decoder_amd64.s -pkg=zstd. DO NOT EDIT.
 //go:build !appengine && !noasm && gc && !noasm
 // +build !appengine,!noasm,gc,!noasm
 // func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int
 //
 // Assembly counterpart of the Go fseDecoder.buildDtable.
 // The result is written to ret+16(FP): 0 on success, 1, 2 or 3 on failure.
 // On failure, details are stored at offsets 24 and 32 of ctx
 // (errParam1 and errParam2 of buildDtableAsmContext).
 TEXT ·buildDtable_asm(SB), $0-24
 	MOVQ ctx+8(FP), CX
 	MOVQ s+0(FP), DI
 	// Load values
 	// DX = byte at s+4098 (presumably s.actualTableLog; confirm against the fseDecoder layout)
 	MOVBQZX 4098(DI), DX
 	// AX = 1 << DX, the table size
 	XORQ    AX, AX
 	BTSQ    DX, AX
 	// BX = ctx.stateTable, SI = ctx.dt
 	MOVQ    (CX), BX
 	MOVQ    16(CX), SI
 	// R8 = table size - 1 (highThreshold in the Go version)
 	LEAQ    -1(AX), R8
 	// CX = ctx.norm
 	MOVQ    8(CX), CX
 	// DI = 16-bit value at s+4096 (presumably s.symbolLen; confirm against the fseDecoder layout)
 	MOVWQZX 4096(DI), DI
 	// End load values
 	// Init, lay down lowprob symbols
 	// R9 is the symbol index, R10 the normalized count of that symbol.
 	XORQ R9, R9
 	JMP  init_main_loop_condition
 init_main_loop:
 	MOVWQSX (CX)(R9*2), R10
 	CMPW    R10, $-1
 	JNE     do_not_update_high_threshold
 	// Count is -1: store the symbol in byte 1 of dt[R8], lower the threshold
 	// and use 1 as the value written to the state table.
 	MOVB    R9, 1(SI)(R8*8)
 	DECQ    R8
 	MOVQ    $0x0000000000000001, R10
 do_not_update_high_threshold:
 	// stateTable[R9] = R10
 	MOVW R10, (BX)(R9*2)
 	INCQ R9
 init_main_loop_condition:
 	CMPQ R9, DI
 	JL   init_main_loop
 	// Spread symbols
 	// Calculate table step
 	// R9 = (tableSize >> 1) + (tableSize >> 3) + 3
 	MOVQ AX, R9
 	SHRQ $0x01, R9
 	MOVQ AX, R10
 	SHRQ $0x03, R10
 	LEAQ 3(R9)(R10*1), R9
 	// Fill add bits values
 	// R10 = table mask, R11 = position, R12 = symbol index
 	LEAQ -1(AX), R10
 	XORQ R11, R11
 	XORQ R12, R12
 	JMP  spread_main_loop_condition
 spread_main_loop:
 	// R13 counts the cells written for this symbol, R14 is its normalized count.
 	XORQ    R13, R13
 	MOVWQSX (CX)(R12*2), R14
 	JMP     spread_inner_loop_condition
 spread_inner_loop:
 	// Store the symbol in byte 1 of dt[position].
 	MOVB R12, 1(SI)(R11*8)
 adjust_position:
 	// position = (position + step) & mask, repeated while position > R8
 	ADDQ R9, R11
 	ANDQ R10, R11
 	CMPQ R11, R8
 	JG   adjust_position
 	INCQ R13
 spread_inner_loop_condition:
 	CMPQ R13, R14
 	JL   spread_inner_loop
 	INCQ R12
 spread_main_loop_condition:
 	CMPQ  R12, DI
 	JL    spread_main_loop
 	// The final position must be 0; otherwise report code 1 with the position in errParam1.
 	TESTQ R11, R11
 	JZ    spread_check_ok
 	MOVQ  ctx+8(FP), AX
 	MOVQ  R11, 24(AX)
 	MOVQ  $+1, ret+16(FP)
 	RET
 spread_check_ok:
 	// Build Decoding table
 	// DI is the index of the table cell being completed.
 	XORQ DI, DI
 build_table_main_table:
 	// CX = symbol stored in byte 1 of dt[DI]
 	MOVBQZX 1(SI)(DI*8), CX
 	// R8 = stateTable[symbol]; the stored value is then incremented.
 	MOVWQZX (BX)(CX*2), R8
 	LEAQ    1(R8), R9
 	MOVW    R9, (BX)(CX*2)
 	// R9 = index of the highest set bit of R8
 	MOVQ    R8, R9
 	BSRQ    R9, R9
 	// CX = DX - R9 (nBits in the Go version)
 	MOVQ    DX, CX
 	SUBQ    R9, CX
 	// R8 = (R8 << nBits) - tableSize (newState in the Go version)
 	SHLQ    CL, R8
 	SUBQ    AX, R8
 	// Store nBits in byte 0 and newState in the 16 bits at offset 2 of dt[DI].
 	MOVB    CL, (SI)(DI*8)
 	MOVW    R8, 2(SI)(DI*8)
 	// newState must not exceed the table size; otherwise report code 2
 	// with newState in errParam1 and the table size in errParam2.
 	CMPQ    R8, AX
 	JLE     build_table_check1_ok
 	MOVQ    ctx+8(FP), CX
 	MOVQ    R8, 24(CX)
 	MOVQ    AX, 32(CX)
 	MOVQ    $+2, ret+16(FP)
 	RET
 build_table_check1_ok:
 	// With nBits == 0 the low 16 bits of newState must differ from the cell index;
 	// otherwise report code 3 with newState in errParam1 and the index in errParam2.
 	TESTB CL, CL
 	JNZ   build_table_check2_ok
 	CMPW  R8, DI
 	JNE   build_table_check2_ok
 	MOVQ  ctx+8(FP), AX
 	MOVQ  R8, 24(AX)
 	MOVQ  DI, 32(AX)
 	MOVQ  $+3, ret+16(FP)
 	RET
 build_table_check2_ok:
 	INCQ DI
 	CMPQ DI, AX
 	JL   build_table_main_table
 	// All cells processed: report success.
 	MOVQ $+0, ret+16(FP)
 	RET
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go
@ -0,0 +1,72 @@
 //go:build !amd64 || appengine || !gc || noasm
 // +build !amd64 appengine !gc noasm
 package zstd
 import (
 	"errors"
 	"fmt"
 )
 // buildDtable fills the decoding table s.dt from the normalized symbol
 // counts in s.norm[:s.symbolLen], using s.stateTable[:256] as per-symbol
 // scratch space for the next state value.
 // An error is returned when spreading the symbols does not end at
 // position 0, or when a computed new state fails the sanity checks.
 func (s *fseDecoder) buildDtable() error {
 	size := uint32(1 << s.actualTableLog)
 	top := size - 1
 	next := s.stateTable[:256]
 	counts := s.norm[:s.symbolLen]
 	// Symbols with a count of -1 each take one cell from the top of the table.
 	for sym, n := range counts {
 		if n != -1 {
 			next[sym] = uint16(n)
 			continue
 		}
 		s.dt[top].setAddBits(uint8(sym))
 		top--
 		next[sym] = 1
 	}
 	// Spread the remaining symbols over the cells below the low probability area.
 	mask := size - 1
 	step := tableStep(size)
 	pos := uint32(0)
 	for sym, n := range counts {
 		for k := int(n); k > 0; k-- {
 			s.dt[pos].setAddBits(uint8(sym))
 			for {
 				pos = (pos + step) & mask
 				if pos <= top {
 					break
 				}
 				// Landed in the low probability area, step again.
 			}
 		}
 	}
 	if pos != 0 {
 		// position must reach all cells once, otherwise normalizedCounter is incorrect
 		return errors.New("corrupted input (position != 0)")
 	}
 	// Derive the bit count and new state base of every cell.
 	tableSize := uint16(1 << s.actualTableLog)
 	for u, v := range s.dt[:tableSize] {
 		symbol := v.addBits()
 		nextState := next[symbol]
 		next[symbol]++
 		nBits := s.actualTableLog - byte(highBits(uint32(nextState)))
 		cell := &s.dt[u&maxTableMask]
 		cell.setNBits(nBits)
 		newState := (nextState << nBits) - tableSize
 		switch {
 		case newState > tableSize:
 			return fmt.Errorf("newState (%d) outside table size (%d)", newState, tableSize)
 		case nBits == 0 && newState == uint16(u):
 			// A state that maps to itself without reading bits can never advance.
 			return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, u)
 		}
 		cell.setNewState(newState)
 	}
 	return nil
 }
--- a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
@ -76,21 +76,6 @@ func (s *fseEncoder) HistogramFinished(maxSymbol uint8, maxCount int) {
 	s.clearCount = maxCount != 0
 }
 // prepare readies the encoder for use and returns it.
 // A nil receiver is replaced by a freshly allocated encoder. RLE mode is
 // switched off, and when a clear of the histogram is pending and maxCount
 // is zero, every entry of count is reset. The returned error is always nil.
 func (s *fseEncoder) prepare() (*fseEncoder, error) {
 	enc := s
 	if enc == nil {
 		enc = new(fseEncoder)
 	}
 	enc.useRLE = false
 	if !enc.clearCount || enc.maxCount != 0 {
 		return enc, nil
 	}
 	for idx := range enc.count {
 		enc.count[idx] = 0
 	}
 	enc.clearCount = false
 	return enc, nil
 }
 // allocCtable will allocate tables needed for compression.
 // If existing tables are big enough, they are simply re-used.
 func (s *fseEncoder) allocCtable() {
@ -709,14 +694,6 @@ func (c *cState) init(bw *bitWriter, ct *cTable, first symbolTransform) {
 	c.state = c.stateTable[lu]
 }
 // encode writes the bits for the provided symbol to the bitstream and moves
 // the encoder to the state selected by the symbol's transform.
 func (c *cState) encode(symbolTT symbolTransform) {
 	cur := c.state
 	// The upper 16 bits of the sum give the number of bits to emit.
 	outBits := (uint32(cur) + symbolTT.deltaNbBits) >> 16
 	c.bw.addBits16NC(cur, uint8(outBits))
 	following := int32(cur>>(outBits&15)) + int32(symbolTT.deltaFindState)
 	c.state = c.stateTable[following]
 }
 // flush will write the tablelog to the output and flush the remaining full bytes.
 func (c *cState) flush(tableLog uint8) {
 	c.bw.flush32()
--- a/vendor/github.com/klauspost/compress/zstd/fuzz.go
+++ b/vendor/github.com/klauspost/compress/zstd/fuzz.go
@ -1,11 +0,0 @@
 //go:build ignorecrc
 // +build ignorecrc
 // Copyright 2019+ Klaus Post. All rights reserved.
 // License information can be found in the LICENSE file.
 // Based on work by Yann Collet, released under BSD License.
 package zstd
 // ignoreCRC can be used for fuzz testing to ignore CRC values...
 // This file is only built with the ignorecrc build tag, so the constant is true here.
 const ignoreCRC = true
--- a/vendor/github.com/klauspost/compress/zstd/fuzz_none.go
+++ b/vendor/github.com/klauspost/compress/zstd/fuzz_none.go
@ -1,11 +0,0 @@
 //go:build !ignorecrc
 // +build !ignorecrc
 // Copyright 2019+ Klaus Post. All rights reserved.
 // License information can be found in the LICENSE file.
 // Based on work by Yann Collet, released under BSD License.
 package zstd
 // ignoreCRC can be used for fuzz testing to ignore CRC values...
 // This file is built when the ignorecrc build tag is absent, so the constant is false here.
 const ignoreCRC = false
--- a/vendor/github.com/klauspost/compress/zstd/hash.go
+++ b/vendor/github.com/klauspost/compress/zstd/hash.go
@ -33,9 +33,3 @@ func hashLen(u uint64, length, mls uint8) uint32 {
 		return (uint32(u) * prime4bytes) >> (32 - length)
 	}
 }
 // hash3 hashes the lower 3 bytes of u into a value that fits a hash table
 // with h bits.
 // Preferably h should be a constant and should always be <32.
 func hash3(u uint32, h uint8) uint32 {
 	// Shift the top byte out so only the lower 3 bytes contribute.
 	const dropTopByte = 32 - 24
 	mixed := (u << dropTopByte) * prime3bytes
 	return mixed >> ((32 - h) & 31)
 }
--- a/vendor/github.com/klauspost/compress/zstd/seqdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec.go
@ -73,6 +73,7 @@ type sequenceDecs struct {
 	seqSize      int
 	windowSize   int
 	maxBits      uint8
 	maxSyncLen   uint64
 }
 // initialize all 3 decoders from the stream input.
@ -98,153 +99,13 @@ func (s *sequenceDecs) initialize(br *bitReader, hist *history, out []byte) erro
 	return nil
 }
 // decode sequences from the stream with the provided history.
 //
 // One (ll, ml, mo) triple is decoded per entry of seqs and stored there.
 // s.seqSize is reset and then accumulates ll+ml of every sequence plus the
 // literals left over at the end. An error is returned when the input runs
 // out, when a sequence fails validation, or when closing the bit reader fails.
 func (s *sequenceDecs) decode(seqs []seqVals) error {
 	br := s.br
 	// Grab full sizes tables, to avoid bounds checks.
 	llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
 	llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
 	s.seqSize = 0
 	litRemain := len(s.literals)
 	// The output of one block is limited by the smaller of the maximum
 	// compressed block size and the window size.
 	maxBlockSize := maxCompressedBlockSize
 	if s.windowSize < maxBlockSize {
 		maxBlockSize = s.windowSize
 	}
 	for i := range seqs {
 		var ll, mo, ml int
 		// Fast path while br.off is above this threshold; otherwise use the
 		// checked path below.
 		if br.off > 4+((maxOffsetBits+16+16)>>3) {
 			// inlined function:
 			// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
 			// Final will not read from stream.
 			var llB, mlB, moB uint8
 			ll, llB = llState.final()
 			ml, mlB = mlState.final()
 			mo, moB = ofState.final()
 			// extra bits are stored in reverse order.
 			br.fillFast()
 			mo += br.getBits(moB)
 			if s.maxBits > 32 {
 				br.fillFast()
 			}
 			ml += br.getBits(mlB)
 			ll += br.getBits(llB)
 			if moB > 1 {
 				// New offset: push it to the front of the previous offsets.
 				s.prevOffset[2] = s.prevOffset[1]
 				s.prevOffset[1] = s.prevOffset[0]
 				s.prevOffset[0] = mo
 			} else {
 				// mo = s.adjustOffset(mo, ll, moB)
 				// Inlined for rather big speedup
 				if ll == 0 {
 					// There is an exception though, when current sequence's literals_length = 0.
 					// In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
 					// an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
 					mo++
 				}
 				if mo == 0 {
 					mo = s.prevOffset[0]
 				} else {
 					var temp int
 					if mo == 3 {
 						temp = s.prevOffset[0] - 1
 					} else {
 						temp = s.prevOffset[mo]
 					}
 					if temp == 0 {
 						// 0 is not valid; input is corrupted; force offset to 1
 						println("WARNING: temp was 0")
 						temp = 1
 					}
 					if mo != 1 {
 						s.prevOffset[2] = s.prevOffset[1]
 					}
 					s.prevOffset[1] = s.prevOffset[0]
 					s.prevOffset[0] = temp
 					mo = temp
 				}
 			}
 			br.fillFast()
 		} else {
 			if br.overread() {
 				if debugDecoder {
 					printf("reading sequence %d, exceeded available data\n", i)
 				}
 				return io.ErrUnexpectedEOF
 			}
 			ll, mo, ml = s.next(br, llState, mlState, ofState)
 			br.fill()
 		}
 		if debugSequences {
 			println("Seq", i, "Litlen:", ll, "mo:", mo, "(abs) ml:", ml)
 		}
 		// Evaluate.
 		// We might be doing this async, so do it early.
 		if mo == 0 && ml > 0 {
 			return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
 		}
 		if ml > maxMatchLen {
 			return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
 		}
 		s.seqSize += ll + ml
 		if s.seqSize > maxBlockSize {
 			return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
 		}
 		litRemain -= ll
 		if litRemain < 0 {
 			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, litRemain+ll)
 		}
 		seqs[i] = seqVals{
 			ll: ll,
 			ml: ml,
 			mo: mo,
 		}
 		if i == len(seqs)-1 {
 			// This is the last sequence, so we shouldn't update state.
 			break
 		}
 		// Manually inlined, ~ 5-20% faster
 		// Update all 3 states at once. Approx 20% faster.
 		nBits := llState.nbBits() + mlState.nbBits() + ofState.nbBits()
 		if nBits == 0 {
 			llState = llTable[llState.newState()&maxTableMask]
 			mlState = mlTable[mlState.newState()&maxTableMask]
 			ofState = ofTable[ofState.newState()&maxTableMask]
 		} else {
 			// One read covers all three states: literal length bits are the
 			// topmost, match length bits the middle and offset bits the lowest.
 			bits := br.get32BitsFast(nBits)
 			lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
 			llState = llTable[(llState.newState()+lowBits)&maxTableMask]
 			lowBits = uint16(bits >> (ofState.nbBits() & 31))
 			lowBits &= bitMask[mlState.nbBits()&15]
 			mlState = mlTable[(mlState.newState()+lowBits)&maxTableMask]
 			lowBits = uint16(bits) & bitMask[ofState.nbBits()&15]
 			ofState = ofTable[(ofState.newState()+lowBits)&maxTableMask]
 		}
 	}
 	// Literals not consumed by any sequence also count towards the output size.
 	s.seqSize += litRemain
 	if s.seqSize > maxBlockSize {
 		return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
 	}
 	err := br.close()
 	if err != nil {
 		printf("Closing sequences: %v, %+v\n", err, *br)
 	}
 	return err
 }
 // execute will execute the decoded sequence with the provided history.
 // The sequence must be evaluated before being sent.
 func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {
 	if len(s.dict) == 0 {
 		return s.executeSimple(seqs, hist)
 	}
 	// Ensure we have enough output size...
 	if len(s.out)+s.seqSize > cap(s.out) {
 		addBytes := s.seqSize + len(s.out)
@ -327,6 +188,7 @@ func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {
 			}
 		}
 	}
 	// Add final literals
 	copy(out[t:], s.literals)
 	if debugDecoder {
@ -341,14 +203,18 @@ func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {
 }
 // decode sequences from the stream with the provided history.
-func (s *sequenceDecs) decodeSync(history *history) error {
+func (s *sequenceDecs) decodeSync(hist []byte) error {
 	supported, err := s.decodeSyncSimple(hist)
 	if supported {
 		return err
 	}
 	br := s.br
 	seqs := s.nSeqs
 	startSize := len(s.out)
 	// Grab full sizes tables, to avoid bounds checks.
 	llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
 	llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
 	hist := history.b[history.ignoreBuffer:]
 	out := s.out
 	maxBlockSize := maxCompressedBlockSize
 	if s.windowSize < maxBlockSize {
@ -433,7 +299,7 @@ func (s *sequenceDecs) decodeSync(history *history) error {
 		}
 		size := ll + ml + len(out)
 		if size-startSize > maxBlockSize {
-			return fmt.Errorf("output (%d) bigger than max block size (%d)", size, maxBlockSize)
+			return fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize)
 		}
 		if size > cap(out) {
 			// Not enough size, which can happen under high volume block streaming conditions
@ -463,13 +329,13 @@ func (s *sequenceDecs) decodeSync(history *history) error {
 		if mo > len(out)+len(hist) || mo > s.windowSize {
 			if len(s.dict) == 0 {
-				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist))
+				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist)-startSize)
 			}
 			// we may be in dictionary.
 			dictO := len(s.dict) - (mo - (len(out) + len(hist)))
 			if dictO < 0 || dictO >= len(s.dict) {
-				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist))
+				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist)-startSize)
 			}
 			end := dictO + ml
 			if end > len(s.dict) {
@ -530,6 +396,7 @@ func (s *sequenceDecs) decodeSync(history *history) error {
 			ofState = ofTable[ofState.newState()&maxTableMask]
 		} else {
 			bits := br.get32BitsFast(nBits)
 			lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
 			llState = llTable[(llState.newState()+lowBits)&maxTableMask]
@ -543,8 +410,8 @@ func (s *sequenceDecs) decodeSync(history *history) error {
 	}
 	// Check if space for literals
-	if len(s.literals)+len(s.out)-startSize > maxBlockSize {
+	if size := len(s.literals) + len(s.out) - startSize; size > maxBlockSize {
-		return fmt.Errorf("output (%d) bigger than max block size (%d)", len(s.out), maxBlockSize)
+		return fmt.Errorf("output (%d) bigger than max block size (%d)", size, maxBlockSize)
 	}
 	// Add final literals
@ -552,16 +419,6 @@ func (s *sequenceDecs) decodeSync(history *history) error {
 	return br.close()
 }
 // update states, at least 27 bits must be available.
 // The literal length, match length and offset states are advanced in that
 // order, each reading its own bits from br.
 func (s *sequenceDecs) update(br *bitReader) {
 	// Max 8 bits
 	s.litLengths.state.next(br)
 	// Max 9 bits
 	s.matchLengths.state.next(br)
 	// Max 8 bits
 	s.offsets.state.next(br)
 }
 var bitMask [16]uint16
 func init() {
@ -570,87 +427,6 @@ func init() {
 	}
 }
 // updateAlt advances the literal length, match length and offset states
 // with a single read from br. At least 27 bits must be available.
 // Approx 20% faster than updating the three states one by one.
 func (s *sequenceDecs) updateAlt(br *bitReader) {
 	ll, ml, of := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
 	llBits, mlBits, ofBits := ll.nbBits(), ml.nbBits(), of.nbBits()
 	total := llBits + mlBits + ofBits
 	if total != 0 {
 		// The literal length bits are the topmost, the match length bits sit
 		// in the middle and the offset bits are the lowest.
 		bits := br.get32BitsFast(total)
 		llLow := uint16(bits >> ((ofBits + mlBits) & 31))
 		mlLow := uint16(bits>>(ofBits&31)) & bitMask[mlBits&15]
 		ofLow := uint16(bits) & bitMask[ofBits&15]
 		s.litLengths.state.state = s.litLengths.state.dt[ll.newState()+llLow]
 		s.matchLengths.state.state = s.matchLengths.state.dt[ml.newState()+mlLow]
 		s.offsets.state.state = s.offsets.state.dt[of.newState()+ofLow]
 		return
 	}
 	// No state needs any bits, so the stream is not touched.
 	s.litLengths.state.state = s.litLengths.state.dt[ll.newState()]
 	s.matchLengths.state.state = s.matchLengths.state.dt[ml.newState()]
 	s.offsets.state.state = s.offsets.state.dt[of.newState()]
 }
 // nextFast decodes the literal length, match offset and match length of the
 // next sequence from the three given states and updates s.prevOffset.
 // It may only be used when there are at least 4 unused bytes left on the
 // stream when done.
 func (s *sequenceDecs) nextFast(br *bitReader, llState, mlState, ofState decSymbol) (ll, mo, ml int) {
 	// Reading the base values does not consume any input.
 	var llBits, mlBits, moBits uint8
 	ll, llBits = llState.final()
 	ml, mlBits = mlState.final()
 	mo, moBits = ofState.final()
 	// extra bits are stored in reverse order.
 	br.fillFast()
 	mo += br.getBits(moBits)
 	if s.maxBits > 32 {
 		br.fillFast()
 	}
 	ml += br.getBits(mlBits)
 	ll += br.getBits(llBits)
 	if moBits > 1 {
 		// New offset: push it to the front of the previous offsets.
 		s.prevOffset[2] = s.prevOffset[1]
 		s.prevOffset[1] = s.prevOffset[0]
 		s.prevOffset[0] = mo
 		return ll, mo, ml
 	}
 	// Inlined s.adjustOffset(mo, ll, moB) for rather big speedup.
 	if ll == 0 {
 		// There is an exception though, when current sequence's literals_length = 0.
 		// In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
 		// an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
 		mo++
 	}
 	if mo == 0 {
 		return ll, s.prevOffset[0], ml
 	}
 	var resolved int
 	if mo == 3 {
 		resolved = s.prevOffset[0] - 1
 	} else {
 		resolved = s.prevOffset[mo]
 	}
 	if resolved == 0 {
 		// 0 is not valid; input is corrupted; force offset to 1
 		println("temp was 0")
 		resolved = 1
 	}
 	if mo != 1 {
 		s.prevOffset[2] = s.prevOffset[1]
 	}
 	s.prevOffset[1] = s.prevOffset[0]
 	s.prevOffset[0] = resolved
 	return ll, resolved, ml
 }
 func (s *sequenceDecs) next(br *bitReader, llState, mlState, ofState decSymbol) (ll, mo, ml int) {
 	// Final will not read from stream.
 	ll, llB := llState.final()
--- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go
@ -0,0 +1,368 @@
 //go:build amd64 && !appengine && !noasm && gc
 // +build amd64,!appengine,!noasm,gc
 package zstd
 import (
 	"fmt"
 	"github.com/klauspost/compress/internal/cpuinfo"
 )
 // decodeSyncAsmContext carries the state shared between decodeSyncSimple and
 // the sequenceDecs_decodeSync_* assembly routines.
 // NOTE(review): the assembly presumably addresses these fields by fixed
 // offsets, so the layout should stay in sync with _generate/gen.go; confirm.
 type decodeSyncAsmContext struct {
 	// Decoding tables and current states of the three sequence decoders.
 	llTable     []decSymbol
 	mlTable     []decSymbol
 	ofTable     []decSymbol
 	llState     uint64
 	mlState     uint64
 	ofState     uint64
 	// iteration is initialised to the number of sequences minus one.
 	iteration   int
 	// litRemain is initialised to the number of literals and is added to
 	// seqSize after a successful call.
 	litRemain   int
 	// out is the output buffer; outPosition starts at its length and is read
 	// back after the call as the new output length.
 	out         []byte
 	outPosition int
 	// literals is the literal buffer; litPosition is read back after the call
 	// as the number of literals consumed.
 	literals    []byte
 	litPosition int
 	history     []byte
 	windowSize  int
 	ll          int // set on error (not for all errors, please refer to _generate/gen.go)
 	ml          int // set on error (not for all errors, please refer to _generate/gen.go)
 	mo          int // set on error (not for all errors, please refer to _generate/gen.go)
 }
 // sequenceDecs_decodeSync_amd64 implements the main loop of sequenceDecs.decodeSync in x86 asm.
 // The returned value is one of the error codes declared in this file (noError on success).
 //
 // Please refer to seqdec_generic.go for the reference implementation.
 //go:noescape
 func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
 // sequenceDecs_decodeSync_bmi2 implements the main loop of sequenceDecs.decodeSync in x86 asm with BMI2 extensions.
 // It is only called when cpuinfo.HasBMI2 reports true.
 //go:noescape
 func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
 // sequenceDecs_decodeSync_safe_amd64 does the same as sequenceDecs_decodeSync_amd64,
 // but does not write more than output buffer.
 //go:noescape
 func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
 // sequenceDecs_decodeSync_safe_bmi2 does the same as sequenceDecs_decodeSync_bmi2,
 // but does not write more than output buffer.
 //go:noescape
 func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
 // decode sequences from the stream with the provided history but without a dictionary.
 //
 // The first result reports whether this fast path handled the block. When it
 // is false nothing has been decoded and the error is nil. When it is true the
 // second result is the outcome of decoding.
 func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
 	// Blocks that use a dictionary are not handled here.
 	if len(s.dict) > 0 {
 		return false, nil
 	}
 	// Without a known sync length the output must have room for a full block.
 	if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSize {
 		return false, nil
 	}
 	// FIXME: Using unsafe memory copies leads to rare, random crashes
 	// with fuzz testing. It is therefore disabled for now.
 	const useSafe = true
 	/*
 		useSafe := false
 		if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc {
 			useSafe = true
 		}
 		if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) {
 			useSafe = true
 		}
 		if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc {
 			useSafe = true
 		}
 	*/
 	br := s.br
 	maxBlockSize := maxCompressedBlockSize
 	if s.windowSize < maxBlockSize {
 		maxBlockSize = s.windowSize
 	}
 	ctx := decodeSyncAsmContext{
 		llTable:     s.litLengths.fse.dt[:maxTablesize],
 		mlTable:     s.matchLengths.fse.dt[:maxTablesize],
 		ofTable:     s.offsets.fse.dt[:maxTablesize],
 		llState:     uint64(s.litLengths.state.state),
 		mlState:     uint64(s.matchLengths.state.state),
 		ofState:     uint64(s.offsets.state.state),
 		iteration:   s.nSeqs - 1,
 		litRemain:   len(s.literals),
 		out:         s.out,
 		outPosition: len(s.out),
 		literals:    s.literals,
 		windowSize:  s.windowSize,
 		history:     hist,
 	}
 	s.seqSize = 0
 	startSize := len(s.out)
 	var errCode int
 	// Pick the assembly routine: BMI2 or plain amd64, safe or unsafe copies.
 	if cpuinfo.HasBMI2() {
 		if useSafe {
 			errCode = sequenceDecs_decodeSync_safe_bmi2(s, br, &ctx)
 		} else {
 			errCode = sequenceDecs_decodeSync_bmi2(s, br, &ctx)
 		}
 	} else {
 		if useSafe {
 			errCode = sequenceDecs_decodeSync_safe_amd64(s, br, &ctx)
 		} else {
 			errCode = sequenceDecs_decodeSync_amd64(s, br, &ctx)
 		}
 	}
 	// Translate the assembly exit code into a Go error.
 	switch errCode {
 	case noError:
 		break
 	case errorMatchLenOfsMismatch:
 		return true, fmt.Errorf("zero matchoff and matchlen (%d) > 0", ctx.ml)
 	case errorMatchLenTooBig:
 		return true, fmt.Errorf("match len (%d) bigger than max allowed length", ctx.ml)
 	case errorMatchOffTooBig:
 		return true, fmt.Errorf("match offset (%d) bigger than current history (%d)",
 			ctx.mo, ctx.outPosition+len(hist)-startSize)
 	case errorNotEnoughLiterals:
 		return true, fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available",
 			ctx.ll, ctx.litRemain+ctx.ll)
 	case errorNotEnoughSpace:
 		size := ctx.outPosition + ctx.ll + ctx.ml
 		if debugDecoder {
 			println("msl:", s.maxSyncLen, "cap", cap(s.out), "bef:", startSize, "sz:", size-startSize, "mbs:", maxBlockSize, "outsz:", cap(s.out)-startSize)
 		}
 		return true, fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize)
 	default:
 		return true, fmt.Errorf("sequenceDecs_decode returned erronous code %d", errCode)
 	}
 	// Literals not consumed by any sequence also count towards the output size.
 	s.seqSize += ctx.litRemain
 	if s.seqSize > maxBlockSize {
 		return true, fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
 	}
 	err := br.close()
 	if err != nil {
 		printf("Closing sequences: %v, %+v\n", err, *br)
 		return true, err
 	}
 	// Drop the literals already consumed and cut the output at the position
 	// reported back in ctx.
 	s.literals = s.literals[ctx.litPosition:]
 	t := ctx.outPosition
 	s.out = s.out[:t]
 	// Add final literals
 	s.out = append(s.out, s.literals...)
 	if debugDecoder {
 		t += len(s.literals)
 		if t != len(s.out) {
 			panic(fmt.Errorf("length mismatch, want %d, got %d", len(s.out), t))
 		}
 	}
 	return true, nil
 }
 // --------------------------------------------------------------------------------
 // decodeAsmContext carries the state shared between sequenceDecs.decode and
 // the sequenceDecs_decode_* assembly routines.
 type decodeAsmContext struct {
 	// Decoding tables and current states of the three sequence decoders.
 	llTable   []decSymbol
 	mlTable   []decSymbol
 	ofTable   []decSymbol
 	llState   uint64
 	mlState   uint64
 	ofState   uint64
 	// iteration is initialised to len(seqs)-1; after an error,
 	// len(seqs)-iteration-1 is used as the index of the failing sequence.
 	iteration int
 	// seqs receives the decoded sequences.
 	seqs      []seqVals
 	// litRemain is initialised to the number of literals; a negative value
 	// after the call is reported as an error.
 	litRemain int
 }
 // Exit codes returned by the sequenceDecs_* assembly routines.
 // noError is returned on success.
 const noError = 0
 // error reported when mo == 0 && ml > 0
 const errorMatchLenOfsMismatch = 1
 // error reported when ml > maxMatchLen
 const errorMatchLenTooBig = 2
 // error reported when mo > available history or mo > s.windowSize
 const errorMatchOffTooBig = 3
 // error reported when the sum of literal lengths exceeds the literal buffer size
 const errorNotEnoughLiterals = 4
 // error reported when capacity of `out` is too small
 const errorNotEnoughSpace = 5
 // sequenceDecs_decode_amd64 implements the main loop of sequenceDecs.decode in x86 asm.
 // The returned value is one of the error codes declared in this file (noError on success).
 //
 // Please refer to seqdec_generic.go for the reference implementation.
 //go:noescape
 func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
 // sequenceDecs_decode_56_amd64 implements the main loop of sequenceDecs.decode in x86 asm.
 // decode selects it when maxBits plus the three actual table logs is at most 56.
 //
 // Please refer to seqdec_generic.go for the reference implementation.
 //go:noescape
 func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
 // sequenceDecs_decode_bmi2 implements the main loop of sequenceDecs.decode in x86 asm with BMI2 extensions.
 //go:noescape
 func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
 // sequenceDecs_decode_56_bmi2 implements the main loop of sequenceDecs.decode in x86 asm with BMI2 extensions.
 // decode selects it when maxBits plus the three actual table logs is at most 56.
 //go:noescape
 func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
 // decode reads the sequences of the current block into seqs without
 // executing them. The decoding loop runs in assembly; this wrapper prepares
 // the shared context, picks the routine matching the CPU and the bit budget,
 // and converts the returned code into a Go error.
 func (s *sequenceDecs) decode(seqs []seqVals) error {
 	br := s.br
 	blockLimit := maxCompressedBlockSize
 	if s.windowSize < blockLimit {
 		blockLimit = s.windowSize
 	}
 	ctx := decodeAsmContext{
 		llTable:   s.litLengths.fse.dt[:maxTablesize],
 		mlTable:   s.matchLengths.fse.dt[:maxTablesize],
 		ofTable:   s.offsets.fse.dt[:maxTablesize],
 		llState:   uint64(s.litLengths.state.state),
 		mlState:   uint64(s.matchLengths.state.state),
 		ofState:   uint64(s.offsets.state.state),
 		seqs:      seqs,
 		iteration: len(seqs) - 1,
 		litRemain: len(s.literals),
 	}
 	s.seqSize = 0
 	lte56bits := s.maxBits+s.offsets.fse.actualTableLog+s.matchLengths.fse.actualTableLog+s.litLengths.fse.actualTableLog <= 56
 	var errCode int
 	hasBMI2 := cpuinfo.HasBMI2()
 	switch {
 	case hasBMI2 && lte56bits:
 		errCode = sequenceDecs_decode_56_bmi2(s, br, &ctx)
 	case hasBMI2:
 		errCode = sequenceDecs_decode_bmi2(s, br, &ctx)
 	case lte56bits:
 		errCode = sequenceDecs_decode_56_amd64(s, br, &ctx)
 	default:
 		errCode = sequenceDecs_decode_amd64(s, br, &ctx)
 	}
 	// The failing sequence is only looked up for the codes that report it.
 	switch errCode {
 	case 0:
 		// Success, continue below.
 	case errorMatchLenOfsMismatch:
 		failed := ctx.seqs[len(seqs)-ctx.iteration-1]
 		return fmt.Errorf("zero matchoff and matchlen (%d) > 0", failed.ml)
 	case errorMatchLenTooBig:
 		failed := ctx.seqs[len(seqs)-ctx.iteration-1]
 		return fmt.Errorf("match len (%d) bigger than max allowed length", failed.ml)
 	case errorNotEnoughLiterals:
 		failed := ctx.seqs[len(seqs)-ctx.iteration-1]
 		return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", failed.ll, ctx.litRemain+failed.ll)
 	default:
 		return fmt.Errorf("sequenceDecs_decode_amd64 returned erronous code %d", errCode)
 	}
 	if ctx.litRemain < 0 {
 		return fmt.Errorf("literal count is too big: total available %d, total requested %d",
 			len(s.literals), len(s.literals)-ctx.litRemain)
 	}
 	// Literals not consumed by any sequence also count towards the output size.
 	s.seqSize += ctx.litRemain
 	if s.seqSize > blockLimit {
 		return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, blockLimit)
 	}
 	err := br.close()
 	if err != nil {
 		printf("Closing sequences: %v, %+v\n", err, *br)
 	}
 	return err
 }
 // --------------------------------------------------------------------------------
 // executeAsmContext carries the state shared between executeSimple and the
 // sequenceDecs_executeSimple_* assembly routines.
 type executeAsmContext struct {
 	// seqs are the sequences to execute; on failure seqIndex is used to look
 	// up the offending sequence.
 	seqs        []seqVals
 	seqIndex    int
 	out         []byte
 	history     []byte
 	literals    []byte
 	// outPosition and litPosition are read back after the call as the new
 	// output position and the number of literals consumed.
 	outPosition int
 	litPosition int
 	windowSize  int
 }
 // sequenceDecs_executeSimple_amd64 implements the main loop of sequenceDecs.executeSimple in x86 asm.
 //
 // Returns false if a match offset is too big.
 //
 // Please refer to seqdec_generic.go for the reference implementation.
 //go:noescape
 func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
 // sequenceDecs_executeSimple_safe_amd64 is the same as
 // sequenceDecs_executeSimple_amd64, but with safe memcopies.
 // executeSimple uses it when the literal buffer lacks
 // compressedBlockOverAlloc bytes of spare capacity.
 //go:noescape
 func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
 // executeSimple executes the decoded sequences when no dictionary is used.
 // It grows s.out when needed, runs the assembly loop over seqs, appends the
 // remaining literals and stores the extended output back in s.out.
 // An error is returned when a match offset reaches beyond the history.
 func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error {
 	// Ensure we have enough output size...
 	if need := len(s.out) + s.seqSize + compressedBlockOverAlloc; need > cap(s.out) {
 		// Grow the capacity, then restore the original length.
 		s.out = append(s.out, make([]byte, need)...)
 		s.out = s.out[:len(s.out)-need]
 	}
 	if debugDecoder {
 		printf("Execute %d seqs with literals: %d into %d bytes\n", len(seqs), len(s.literals), s.seqSize)
 	}
 	start := len(s.out)
 	dst := s.out[:start+s.seqSize]
 	ctx := executeAsmContext{
 		seqs:        seqs,
 		out:         dst,
 		history:     hist,
 		literals:    s.literals,
 		outPosition: start,
 		windowSize:  s.windowSize,
 	}
 	// The fast routine needs spare capacity behind the literals.
 	var ok bool
 	if cap(s.literals) >= len(s.literals)+compressedBlockOverAlloc {
 		ok = sequenceDecs_executeSimple_amd64(&ctx)
 	} else {
 		ok = sequenceDecs_executeSimple_safe_amd64(&ctx)
 	}
 	if !ok {
 		return fmt.Errorf("match offset (%d) bigger than current history (%d)",
 			seqs[ctx.seqIndex].mo, ctx.outPosition+len(hist))
 	}
 	s.literals = s.literals[ctx.litPosition:]
 	end := ctx.outPosition
 	// Add final literals
 	copy(dst[end:], s.literals)
 	if debugDecoder {
 		if got := end + len(s.literals); got != len(dst) {
 			panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(dst), got, s.seqSize))
 		}
 	}
 	s.out = dst
 	return nil
 }
--- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
--- a/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go
@ -0,0 +1,237 @@
 //go:build !amd64 || appengine || !gc || noasm
 // +build !amd64 appengine !gc noasm
 package zstd
 import (
 	"fmt"
 	"io"
 )
 // decodeSyncSimple is the counterpart of the routine that decodes sequences
 // from the stream with the provided history but without dictionary.
 //
 // In this file (built for non-amd64, appengine, non-gc or noasm, per the
 // build constraints at the top) it is a stub: hist is ignored, nothing is
 // decoded and the result is always (false, nil).
 // NOTE(review): presumably the false result tells the caller to use its
 // regular decode path instead - confirm at the call site.
 func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
 	return false, nil
 }
 // decode reads len(seqs) sequences from the bit stream s.br and stores them in
 // seqs, without the provided history.
 //
 // Only the sequence values are produced here: literal length (ll), match
 // length (ml) and match offset (mo). No output bytes are written.
 // s.seqSize is reset to 0 and ends up as the sum of every ll+ml plus the
 // literals that remain after the last sequence.
 //
 // Errors:
 //   - io.ErrUnexpectedEOF when the bit reader reports an over-read.
 //   - a formatted error when a sequence has a zero offset with a non-zero
 //     match length, a match length above maxMatchLen, needs more literals than
 //     are available, or makes the block exceed the maximum block size.
 //   - otherwise whatever br.close() returns.
 func (s *sequenceDecs) decode(seqs []seqVals) error {
 	br := s.br
 	// Grab full sizes tables, to avoid bounds checks.
 	llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
 	llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
 	s.seqSize = 0
 	litRemain := len(s.literals)
 	// The block may not exceed the smaller of maxCompressedBlockSize and the window size.
 	maxBlockSize := maxCompressedBlockSize
 	if s.windowSize < maxBlockSize {
 		maxBlockSize = s.windowSize
 	}
 	for i := range seqs {
 		var ll, mo, ml int
 		// Fast path while br.off is above this threshold.
 		// NOTE(review): presumably this guarantees enough input remains for the
 		// unchecked fillFast/getBits calls below - confirm against the bit reader.
 		if br.off > 4+((maxOffsetBits+16+16)>>3) {
 			// inlined function:
 			// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
 			// Final will not read from stream.
 			var llB, mlB, moB uint8
 			ll, llB = llState.final()
 			ml, mlB = mlState.final()
 			mo, moB = ofState.final()
 			// extra bits are stored in reverse order.
 			br.fillFast()
 			mo += br.getBits(moB)
 			if s.maxBits > 32 {
 				br.fillFast()
 			}
 			ml += br.getBits(mlB)
 			ll += br.getBits(llB)
 			if moB > 1 {
 				// mo is used as-is and pushed to the front of the previous-offset list.
 				s.prevOffset[2] = s.prevOffset[1]
 				s.prevOffset[1] = s.prevOffset[0]
 				s.prevOffset[0] = mo
 			} else {
 				// mo = s.adjustOffset(mo, ll, moB)
 				// Inlined for rather big speedup
 				if ll == 0 {
 					// There is an exception though, when current sequence's literals_length = 0.
 					// In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
 					// an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
 					mo++
 				}
 				if mo == 0 {
 					// Most recent offset is reused; the list is left untouched.
 					mo = s.prevOffset[0]
 				} else {
 					var temp int
 					if mo == 3 {
 						temp = s.prevOffset[0] - 1
 					} else {
 						temp = s.prevOffset[mo]
 					}
 					if temp == 0 {
 						// 0 is not valid; input is corrupted; force offset to 1
 						println("WARNING: temp was 0")
 						temp = 1
 					}
 					// mo == 1 only swaps the two most recent offsets;
 					// otherwise the oldest entry is overwritten as well.
 					if mo != 1 {
 						s.prevOffset[2] = s.prevOffset[1]
 					}
 					s.prevOffset[1] = s.prevOffset[0]
 					s.prevOffset[0] = temp
 					mo = temp
 				}
 			}
 			br.fillFast()
 		} else {
 			// Slow path: check for over-read before using the checked helpers.
 			if br.overread() {
 				if debugDecoder {
 					printf("reading sequence %d, exceeded available data\n", i)
 				}
 				return io.ErrUnexpectedEOF
 			}
 			ll, mo, ml = s.next(br, llState, mlState, ofState)
 			br.fill()
 		}
 		if debugSequences {
 			println("Seq", i, "Litlen:", ll, "mo:", mo, "(abs) ml:", ml)
 		}
 		// Evaluate.
 		// We might be doing this async, so do it early.
 		if mo == 0 && ml > 0 {
 			return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
 		}
 		if ml > maxMatchLen {
 			return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
 		}
 		s.seqSize += ll + ml
 		if s.seqSize > maxBlockSize {
 			return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
 		}
 		// Sequences may not consume more literals than are available.
 		litRemain -= ll
 		if litRemain < 0 {
 			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, litRemain+ll)
 		}
 		seqs[i] = seqVals{
 			ll: ll,
 			ml: ml,
 			mo: mo,
 		}
 		if i == len(seqs)-1 {
 			// This is the last sequence, so we shouldn't update state.
 			break
 		}
 		// Manually inlined, ~ 5-20% faster
 		// Update all 3 states at once. Approx 20% faster.
 		nBits := llState.nbBits() + mlState.nbBits() + ofState.nbBits()
 		if nBits == 0 {
 			llState = llTable[llState.newState()&maxTableMask]
 			mlState = mlTable[mlState.newState()&maxTableMask]
 			ofState = ofTable[ofState.newState()&maxTableMask]
 		} else {
 			// One combined read; the value is split as: literal-length bits on top,
 			// match-length bits in the middle, offset bits at the bottom.
 			bits := br.get32BitsFast(nBits)
 			lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
 			llState = llTable[(llState.newState()+lowBits)&maxTableMask]
 			lowBits = uint16(bits >> (ofState.nbBits() & 31))
 			lowBits &= bitMask[mlState.nbBits()&15]
 			mlState = mlTable[(mlState.newState()+lowBits)&maxTableMask]
 			lowBits = uint16(bits) & bitMask[ofState.nbBits()&15]
 			ofState = ofTable[(ofState.newState()+lowBits)&maxTableMask]
 		}
 	}
 	// Literals left after the last sequence also count toward the block size.
 	s.seqSize += litRemain
 	if s.seqSize > maxBlockSize {
 		return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
 	}
 	err := br.close()
 	if err != nil {
 		printf("Closing sequences: %v, %+v\n", err, *br)
 	}
 	return err
 }
 // executeSimple executes the decoded sequences and appends the produced bytes
 // to s.out. It handles the case where a dictionary is not used; hist is the
 // earlier output that match offsets may reach back into.
 //
 // For every sequence the literals are copied first, then the match, which may
 // start in hist and continue in the block being written. An error is returned
 // when a match offset reaches beyond the available history or the window size.
 func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error {
 	// Grow s.out when it cannot hold this block.
 	if need := len(s.out) + s.seqSize; need > cap(s.out) {
 		oldLen := len(s.out)
 		s.out = append(s.out, make([]byte, need)...)[:oldLen]
 	}
 	if debugDecoder {
 		printf("Execute %d seqs with literals: %d into %d bytes\n", len(seqs), len(s.literals), s.seqSize)
 	}
 	pos := len(s.out)
 	out := s.out[:pos+s.seqSize]
 	for i := range seqs {
 		ll, ml, mo := seqs[i].ll, seqs[i].ml, seqs[i].mo
 		// Literals come first.
 		copy(out[pos:], s.literals[:ll])
 		pos += ll
 		s.literals = s.literals[ll:]
 		// Reject offsets that point before the start of history or outside the window.
 		if mo > pos+len(hist) || mo > s.windowSize {
 			return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, pos+len(hist))
 		}
 		// fromHist is how far the match reaches back into hist, counted from its end.
 		if fromHist := mo - pos; fromHist > 0 {
 			histStart := len(hist) - fromHist
 			if ml <= fromHist {
 				// The whole match lives in history.
 				copy(out[pos:], hist[histStart:histStart+ml])
 				pos += ml
 				continue
 			}
 			// Take the tail of history; the rest comes from the current block.
 			copy(out[pos:], hist[histStart:])
 			pos += fromHist
 			ml -= fromHist
 		}
 		if ml <= 0 {
 			continue
 		}
 		// From here on the source is inside the current block.
 		from := pos - mo
 		if ml <= mo {
 			// Source and destination do not overlap.
 			copy(out[pos:], out[from:from+ml])
 			pos += ml
 			continue
 		}
 		// Overlapping match: bytes written earlier in this very copy are read
 		// again, so it has to go forward one byte at a time.
 		srcBytes := out[from : from+ml]
 		dstBytes := out[pos:][:len(srcBytes)]
 		pos += len(srcBytes)
 		for j := range srcBytes {
 			dstBytes[j] = srcBytes[j]
 		}
 	}
 	// Whatever literals remain follow the last sequence.
 	copy(out[pos:], s.literals)
 	if debugDecoder {
 		if end := pos + len(s.literals); end != len(out) {
 			panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), end, s.seqSize))
 		}
 	}
 	s.out = out
 	return nil
 }
--- a/vendor/github.com/klauspost/compress/zstd/zip.go
+++ b/vendor/github.com/klauspost/compress/zstd/zip.go
@ -18,26 +18,44 @@ const ZipMethodWinZip = 93
 // See https://pkware.cachefly.net/webdocs/APPNOTE/APPNOTE-6.3.9.TXT
 const ZipMethodPKWare = 20
-var zipReaderPool sync.Pool
+// zipReaderPool is the default reader pool.
 var zipReaderPool = sync.Pool{
 	// New builds a low-memory, single-goroutine decoder with a 128 MiB
 	// maximum window and no input attached yet.
 	New: func() interface{} {
 		dec, err := NewReader(nil, WithDecoderLowmem(true), WithDecoderMaxWindow(128<<20), WithDecoderConcurrency(1))
 		if err != nil {
 			panic(err)
 		}
 		return dec
 	},
 }
 // newZipReader creates a pooled zip decompressor.
-func newZipReader(r io.Reader) io.ReadCloser {
+func newZipReader(opts ...DOption) func(r io.Reader) io.ReadCloser {
-	dec, ok := zipReaderPool.Get().(*Decoder)
+	pool := &zipReaderPool
-	if ok {
+	if len(opts) > 0 {
-		dec.Reset(r)
+		opts = append([]DOption{WithDecoderLowmem(true), WithDecoderMaxWindow(128 << 20)}, opts...)
-	} else {
+		// Force concurrency 1
-		d, err := NewReader(r, WithDecoderConcurrency(1), WithDecoderLowmem(true))
+		opts = append(opts, WithDecoderConcurrency(1))
-		if err != nil {
+		// Create our own pool
-			panic(err)
+		pool = &sync.Pool{}
-		}
+	}
-		dec = d
+	return func(r io.Reader) io.ReadCloser {
 		dec, ok := pool.Get().(*Decoder)
 		if ok {
 			dec.Reset(r)
 		} else {
 			d, err := NewReader(r, opts...)
 			if err != nil {
 				panic(err)
 			}
 			dec = d
 		}
 		return &pooledZipReader{dec: dec, pool: pool}
 	}
 	return &pooledZipReader{dec: dec}
 }
 type pooledZipReader struct {
-	mu  sync.Mutex // guards Close and Read
+	mu   sync.Mutex // guards Close and Read
-	dec *Decoder
+	pool *sync.Pool
 	dec  *Decoder
 }
 func (r *pooledZipReader) Read(p []byte) (n int, err error) {
@ -48,8 +66,8 @@ func (r *pooledZipReader) Read(p []byte) (n int, err error) {
 	}
 	dec, err := r.dec.Read(p)
 	if err == io.EOF {
-		err = r.dec.Reset(nil)
+		r.dec.Reset(nil)
-		zipReaderPool.Put(r.dec)
+		r.pool.Put(r.dec)
 		r.dec = nil
 	}
 	return dec, err
@ -61,7 +79,7 @@ func (r *pooledZipReader) Close() error {
 	var err error
 	if r.dec != nil {
 		err = r.dec.Reset(nil)
-		zipReaderPool.Put(r.dec)
+		r.pool.Put(r.dec)
 		r.dec = nil
 	}
 	return err
@ -115,6 +133,9 @@ func ZipCompressor(opts ...EOption) func(w io.Writer) (io.WriteCloser, error) {
 // ZipDecompressor returns a decompressor that can be registered with zip libraries.
 // See ZipCompressor for example.
-func ZipDecompressor() func(r io.Reader) io.ReadCloser {
+// Options can be specified. WithDecoderConcurrency(1) is forced,
-	return newZipReader
+// and by default a 128MB maximum decompression window is specified.
 // The window size can be overridden if required.
 func ZipDecompressor(opts ...DOption) func(r io.Reader) io.ReadCloser {
 	// All option handling lives in newZipReader; the constructor it returns
 	// is handed back unchanged.
 	return newZipReader(opts...)
 }
--- a/vendor/github.com/klauspost/compress/zstd/zstd.go
+++ b/vendor/github.com/klauspost/compress/zstd/zstd.go
@ -110,17 +110,6 @@ func printf(format string, a ...interface{}) {
 	}
 }
 // matchLenFast returns the number of leading bytes of a that equal b,
 // comparing 8 bytes at a time. The trailing len(a)%8 bytes (up to 7) are
 // never examined, so the result can be short by that much.
 // NOTE(review): b is read at the same positions as a, so it presumably must
 // be at least as long - confirm against load64.
 func matchLenFast(a, b []byte) int {
 	// Largest multiple of 8 that fits in len(a) (after masking to 31 bits).
 	limit := len(a) & (math.MaxInt32 - 7)
 	for off := 0; off < limit; off += 8 {
 		x := load64(a, off) ^ load64(b, off)
 		if x == 0 {
 			continue
 		}
 		// Lowest set bit / 8 is the first differing byte within this word
 		// (assuming load64 reads little-endian).
 		return off + bits.TrailingZeros64(x)>>3
 	}
 	return limit
 }
 // matchLen returns the maximum length.
 // a must be the shortest of the two.
 // The function also returns whether all bytes matched.
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@ -133,11 +133,12 @@ github.com/imdario/mergo
 # github.com/inconshreveable/mousetrap v1.0.0
 ## explicit
 github.com/inconshreveable/mousetrap
-# github.com/klauspost/compress v1.15.1
+# github.com/klauspost/compress v1.15.9
-## explicit; go 1.15
+## explicit; go 1.16
 github.com/klauspost/compress
 github.com/klauspost/compress/fse
 github.com/klauspost/compress/huff0
 github.com/klauspost/compress/internal/cpuinfo
 github.com/klauspost/compress/internal/snapref
 github.com/klauspost/compress/zstd
 github.com/klauspost/compress/zstd/internal/xxhash