From 7df29980d649043ddb9e4082de281fcc0db75329 Mon Sep 17 00:00:00 2001
From: Klaus Post
Date: Tue, 1 Aug 2023 11:35:28 +0200
Subject: [PATCH] flate: Add limited window compression

Adds a medium compressor that can operate with a limited window size.
Exposed in gzip as well as flate for now.

Example sizes:

```
=== RUN   TestFileWindow/32
    gzip_test.go:349: size: 82504 bytes
=== RUN   TestFileWindow/64
    gzip_test.go:349: size: 75350 bytes
=== RUN   TestFileWindow/128
    gzip_test.go:349: size: 70668 bytes
=== RUN   TestFileWindow/256
    gzip_test.go:349: size: 69276 bytes
=== RUN   TestFileWindow/512
    gzip_test.go:349: size: 68327 bytes
=== RUN   TestFileWindow/1024
    gzip_test.go:349: size: 67876 bytes
=== RUN   TestFileWindow/2048
    gzip_test.go:349: size: 40900 bytes
=== RUN   TestFileWindow/4096
    gzip_test.go:349: size: 38684 bytes
=== RUN   TestFileWindow/8192
    gzip_test.go:349: size: 36263 bytes
=== RUN   TestFileWindow/16384
    gzip_test.go:349: size: 35434 bytes
=== RUN   TestFileWindow/32768
    gzip_test.go:349: size: 34654 bytes
--- PASS: TestFileWindow (0.03s)
```

Limited testing done.
---
 flate/deflate.go     |  29 ++++
 flate/fuzz_test.go   | 189 ++++++++++++--------
 flate/level5.go      | 398 +++++++++++++++++++++++++++++++++++++++++++
 flate/reader_test.go |   1 +
 flate/writer_test.go |   4 +
 gzip/gzip.go         |  21 +++
 gzip/gzip_test.go    |  61 ++++++-
 7 files changed, 631 insertions(+), 72 deletions(-)

diff --git a/flate/deflate.go b/flate/deflate.go
index 5faea0b2b3..de912e187c 100644
--- a/flate/deflate.go
+++ b/flate/deflate.go
@@ -7,6 +7,7 @@ package flate
 
 import (
 	"encoding/binary"
+	"errors"
 	"fmt"
 	"io"
 	"math"
@@ -833,6 +834,12 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
 		d.initDeflate()
 		d.fill = (*compressor).fillDeflate
 		d.step = (*compressor).deflateLazy
+	case -level >= MinCustomWindowSize && -level <= MaxCustomWindowSize:
+		d.w.logNewTablePenalty = 7
+		d.fast = &fastEncL5Window{maxOffset: int32(-level), cur: maxStoreBlockSize}
+		d.window = make([]byte, maxStoreBlockSize)
+		d.fill = (*compressor).fillBlock
+		d.step = (*compressor).storeFast
 	default:
 		return fmt.Errorf("flate: invalid compression level %d: want value in range [-2, 9]", level)
 	}
@@ -929,6 +936,28 @@ func NewWriterDict(w io.Writer, level int, dict []byte) (*Writer, error) {
 	return zw, err
 }
 
+// MinCustomWindowSize is the minimum window size that can be sent to NewWriterWindow.
+const MinCustomWindowSize = 32
+
+// MaxCustomWindowSize is the maximum custom window that can be sent to NewWriterWindow.
+const MaxCustomWindowSize = windowSize
+
+// NewWriterWindow returns a new Writer compressing data with a custom window size.
+// windowSize must be from MinCustomWindowSize to MaxCustomWindowSize.
+func NewWriterWindow(w io.Writer, windowSize int) (*Writer, error) {
+	if windowSize < MinCustomWindowSize {
+		return nil, errors.New("flate: requested window size less than MinCustomWindowSize")
+	}
+	if windowSize > MaxCustomWindowSize {
+		return nil, errors.New("flate: requested window size bigger than MaxCustomWindowSize")
+	}
+	var dw Writer
+	if err := dw.d.init(w, -windowSize); err != nil {
+		return nil, err
+	}
+	return &dw, nil
+}
+
 // A Writer takes data written to it and writes the compressed
 // form of that data to an underlying writer (see NewWriter).
 type Writer struct {
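Reviewer note: a minimal, self-contained sketch of the new flate API in use. It assumes this patch is applied; the import path is inferred from the file layout, and the payload and the 1 KiB window are arbitrary choices for illustration. The output stream is ordinary deflate, so the stock reader decodes it:

```go
package main

import (
	"bytes"
	"fmt"
	"io"
	"log"

	"github.com/klauspost/compress/flate"
)

func main() {
	payload := bytes.Repeat([]byte("limited window deflate example. "), 1024)

	// Compress with a 1 KiB window instead of the default 32 KiB.
	var compressed bytes.Buffer
	fw, err := flate.NewWriterWindow(&compressed, 1024)
	if err != nil {
		log.Fatal(err)
	}
	if _, err := fw.Write(payload); err != nil {
		log.Fatal(err)
	}
	if err := fw.Close(); err != nil {
		log.Fatal(err)
	}

	// The output is regular deflate and inflates with any reader.
	fr := flate.NewReader(&compressed)
	decoded, err := io.ReadAll(fr)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("round trip ok:", bytes.Equal(payload, decoded))
}
```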
diff --git a/flate/fuzz_test.go b/flate/fuzz_test.go
index 527bad25d1..cdda0f5ce7 100644
--- a/flate/fuzz_test.go
+++ b/flate/fuzz_test.go
@@ -19,6 +19,7 @@ var fuzzStartF = flag.Int("start", HuffmanOnly, "Start fuzzing at this level")
 var fuzzEndF = flag.Int("end", BestCompression, "End fuzzing at this level (inclusive)")
 var fuzzMaxF = flag.Int("max", 1<<20, "Maximum input size")
 var fuzzSLF = flag.Bool("sl", true, "Include stateless encodes")
+var fuzzWindow = flag.Bool("windows", true, "Include windowed encodes")
 
 func TestMain(m *testing.M) {
 	flag.Parse()
@@ -34,6 +35,7 @@ func FuzzEncoding(f *testing.F) {
 	endFuzz := *fuzzEndF
 	maxSize := *fuzzMaxF
 	stateless := *fuzzSLF
+	fuzzWindow := *fuzzWindow
 	decoder := NewReader(nil)
 	buf := new(bytes.Buffer)
@@ -52,77 +54,130 @@ func FuzzEncoding(f *testing.F) {
 		}
 		for level := startFuzz; level <= endFuzz; level++ {
 			msg := "level " + strconv.Itoa(level) + ":"
-			buf.Reset()
-			fw := encs[level-startFuzz]
-			fw.Reset(buf)
-			n, err := fw.Write(data)
-			if n != len(data) {
-				t.Fatal(msg + "short write")
-			}
-			if err != nil {
-				t.Fatal(msg + err.Error())
-			}
-			err = fw.Close()
-			if err != nil {
-				t.Fatal(msg + err.Error())
-			}
-			decoder.(Resetter).Reset(buf, nil)
-			data2, err := io.ReadAll(decoder)
-			if err != nil {
-				t.Fatal(msg + err.Error())
-			}
-			if !bytes.Equal(data, data2) {
-				t.Fatal(msg + "not equal")
-			}
-			// Do it again...
-			msg = "level " + strconv.Itoa(level) + " (reset):"
-			buf.Reset()
-			fw.Reset(buf)
-			n, err = fw.Write(data)
-			if n != len(data) {
-				t.Fatal(msg + "short write")
-			}
-			if err != nil {
-				t.Fatal(msg + err.Error())
-			}
-			err = fw.Close()
-			if err != nil {
-				t.Fatal(msg + err.Error())
-			}
-			decoder.(Resetter).Reset(buf, nil)
-			data2, err = io.ReadAll(decoder)
-			if err != nil {
-				t.Fatal(msg + err.Error())
-			}
-			if !bytes.Equal(data, data2) {
-				t.Fatal(msg + "not equal")
-			}
-		}
-		if !stateless {
-			return
-		}
-		// Split into two and use history...
-		buf.Reset()
-		err := StatelessDeflate(buf, data[:len(data)/2], false, nil)
-		if err != nil {
-			t.Error(err)
-		}
-
-		// Use top half as dictionary...
-		dict := data[:len(data)/2]
-		err = StatelessDeflate(buf, data[len(data)/2:], true, dict)
-		if err != nil {
-			t.Error(err)
-		}
-
-		decoder.(Resetter).Reset(buf, nil)
-		data2, err := io.ReadAll(decoder)
-		if err != nil {
-			t.Error(err)
-		}
-		if !bytes.Equal(data, data2) {
-			//fmt.Printf("want:%x\ngot: %x\n", data1, data2)
-			t.Error("not equal")
-		}
+			t.Run(msg, func(t *testing.T) {
+				buf.Reset()
+				fw := encs[level-startFuzz]
+				fw.Reset(buf)
+				n, err := fw.Write(data)
+				if n != len(data) {
+					t.Fatal(msg + "short write")
+				}
+				if err != nil {
+					t.Fatal(msg + err.Error())
+				}
+				err = fw.Close()
+				if err != nil {
+					t.Fatal(msg + err.Error())
+				}
+				decoder.(Resetter).Reset(buf, nil)
+				data2, err := io.ReadAll(decoder)
+				if err != nil {
+					t.Fatal(msg + err.Error())
+				}
+				if !bytes.Equal(data, data2) {
+					t.Fatal(msg + "not equal")
+				}
+				// Do it again...
+				msg = "level " + strconv.Itoa(level) + " (reset):"
+				buf.Reset()
+				fw.Reset(buf)
+				n, err = fw.Write(data)
+				if n != len(data) {
+					t.Fatal(msg + "short write")
+				}
+				if err != nil {
+					t.Fatal(msg + err.Error())
+				}
+				err = fw.Close()
+				if err != nil {
+					t.Fatal(msg + err.Error())
+				}
+				decoder.(Resetter).Reset(buf, nil)
+				data2, err = io.ReadAll(decoder)
+				if err != nil {
+					t.Fatal(msg + err.Error())
+				}
+				if !bytes.Equal(data, data2) {
+					t.Fatal(msg + "not equal")
+				}
+			})
+		}
+		if stateless {
+			t.Run("stateless", func(t *testing.T) {
+				// Split into two and use history...
+				buf.Reset()
+				err := StatelessDeflate(buf, data[:len(data)/2], false, nil)
+				if err != nil {
+					t.Error(err)
+				}
+
+				// Use top half as dictionary...
+				dict := data[:len(data)/2]
+				err = StatelessDeflate(buf, data[len(data)/2:], true, dict)
+				if err != nil {
+					t.Error(err)
+				}
+
+				decoder.(Resetter).Reset(buf, nil)
+				data2, err := io.ReadAll(decoder)
+				if err != nil {
+					t.Error(err)
+				}
+				if !bytes.Equal(data, data2) {
+					//fmt.Printf("want:%x\ngot: %x\n", data1, data2)
+					t.Error("not equal")
+				}
+			})
+		}
+		if fuzzWindow {
+			t.Run("window", func(t *testing.T) {
+				msg := "windowed"
+				buf.Reset()
+				fw, err := NewWriterWindow(buf, 1000)
+				if err != nil {
+					t.Fatal(msg + err.Error())
+				}
+				fw.Reset(buf)
+				n, err := fw.Write(data)
+				if n != len(data) {
+					t.Fatal(msg + "short write")
+				}
+				if err != nil {
+					t.Fatal(msg + err.Error())
+				}
+				err = fw.Close()
+				if err != nil {
+					t.Fatal(msg + err.Error())
+				}
+				decoder.(Resetter).Reset(buf, nil)
+				data2, err := io.ReadAll(decoder)
+				if err != nil {
+					t.Fatal(msg + err.Error())
+				}
+				if !bytes.Equal(data, data2) {
+					t.Fatal(msg + "not equal")
+				}
+				// Do it again...
+				msg = msg + " (reset):"
+				buf.Reset()
+				fw.Reset(buf)
+				n, err = fw.Write(data)
+				if n != len(data) {
+					t.Fatal(msg + "short write")
+				}
+				if err != nil {
+					t.Fatal(msg + err.Error())
+				}
+				err = fw.Close()
+				if err != nil {
+					t.Fatal(msg + err.Error())
+				}
+				decoder.(Resetter).Reset(buf, nil)
+				data2, err = io.ReadAll(decoder)
+				if err != nil {
+					t.Fatal(msg + err.Error())
+				}
+				if !bytes.Equal(data, data2) {
+					t.Fatal(msg + "not equal")
+				}
+			})
+		}
 	})
 }
diff --git a/flate/level5.go b/flate/level5.go
index 83ef50ba45..1f61ec1829 100644
--- a/flate/level5.go
+++ b/flate/level5.go
@@ -308,3 +308,401 @@ emitRemainder:
 		emitLiteral(dst, src[nextEmit:])
 	}
 }
+
+// fastEncL5Window is a level 5 encoder,
+// but with a custom window size.
+type fastEncL5Window struct {
+	hist      []byte
+	cur       int32
+	maxOffset int32
+	table     [tableSize]tableEntry
+	bTable    [tableSize]tableEntryPrev
+}
+
+func (e *fastEncL5Window) Encode(dst *tokens, src []byte) {
+	const (
+		inputMargin            = 12 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+		hashShortBytes         = 4
+	)
+	maxMatchOffset := e.maxOffset
+	if debugDeflate && e.cur < 0 {
+		panic(fmt.Sprint("e.cur < 0: ", e.cur))
+	}
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			for i := range e.bTable[:] {
+				e.bTable[i] = tableEntryPrev{}
+			}
+			e.cur = maxMatchOffset
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v <= minOff {
+				v = 0
+			} else {
+				v = v - e.cur + maxMatchOffset
+			}
+			e.table[i].offset = v
+		}
+		for i := range e.bTable[:] {
+			v := e.bTable[i]
+			if v.Cur.offset <= minOff {
+				v.Cur.offset = 0
+				v.Prev.offset = 0
+			} else {
+				v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset
+				if v.Prev.offset <= minOff {
+					v.Prev.offset = 0
+				} else {
+					v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset
+				}
+			}
+			e.bTable[i] = v
+		}
+		e.cur = maxMatchOffset
+	}
+
+	s := e.addBlock(src)
+
+	// This check isn't in the Snappy implementation, but there, the caller
+	// instead of the callee handles this case.
+	if len(src) < minNonLiteralBlockSize {
+		// We do not fill the token table.
+		// This will be picked up by the caller.
+		dst.n = uint16(len(src))
+		return
+	}
+
+	// Override src
+	src = e.hist
+	nextEmit := s
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int32(len(src) - inputMargin)
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	cv := load6432(src, s)
+	for {
+		const skipLog = 6
+		const doEvery = 1
+
+		nextS := s
+		var l int32
+		var t int32
+		for {
+			nextHashS := hashLen(cv, tableBits, hashShortBytes)
+			nextHashL := hash7(cv, tableBits)
+
+			s = nextS
+			nextS = s + doEvery + (s-nextEmit)>>skipLog
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			// Fetch a short+long candidate
+			sCandidate := e.table[nextHashS]
+			lCandidate := e.bTable[nextHashL]
+			next := load6432(src, nextS)
+			entry := tableEntry{offset: s + e.cur}
+			e.table[nextHashS] = entry
+			eLong := &e.bTable[nextHashL]
+			eLong.Cur, eLong.Prev = entry, eLong.Cur
+
+			nextHashS = hashLen(next, tableBits, hashShortBytes)
+			nextHashL = hash7(next, tableBits)
+
+			t = lCandidate.Cur.offset - e.cur
+			if s-t < maxMatchOffset {
+				if uint32(cv) == load3232(src, lCandidate.Cur.offset-e.cur) {
+					// Store the next match
+					e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
+					eLong := &e.bTable[nextHashL]
+					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur
+
+					t2 := lCandidate.Prev.offset - e.cur
+					if s-t2 < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) {
+						l = e.matchlen(s+4, t+4, src) + 4
+						ml1 := e.matchlen(s+4, t2+4, src) + 4
+						if ml1 > l {
+							t = t2
+							l = ml1
+							break
+						}
+					}
+					break
+				}
+				t = lCandidate.Prev.offset - e.cur
+				if s-t < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) {
+					// Store the next match
+					e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
+					eLong := &e.bTable[nextHashL]
+					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur
+					break
+				}
+			}
+
+			t = sCandidate.offset - e.cur
+			if s-t < maxMatchOffset && uint32(cv) == load3232(src, sCandidate.offset-e.cur) {
+				// Found a 4-byte match...
+				l = e.matchlen(s+4, t+4, src) + 4
+				lCandidate = e.bTable[nextHashL]
+				// Store the next match
+
+				e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
+				eLong := &e.bTable[nextHashL]
+				eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur
+
+				// If the next long is a candidate, use that...
+				t2 := lCandidate.Cur.offset - e.cur
+				if nextS-t2 < maxMatchOffset {
+					if load3232(src, lCandidate.Cur.offset-e.cur) == uint32(next) {
+						ml := e.matchlen(nextS+4, t2+4, src) + 4
+						if ml > l {
+							t = t2
+							s = nextS
+							l = ml
+							break
+						}
+					}
+					// If the previous long is a candidate, use that...
+					t2 = lCandidate.Prev.offset - e.cur
+					if nextS-t2 < maxMatchOffset && load3232(src, lCandidate.Prev.offset-e.cur) == uint32(next) {
+						ml := e.matchlen(nextS+4, t2+4, src) + 4
+						if ml > l {
+							t = t2
+							s = nextS
+							l = ml
+							break
+						}
+					}
+				}
+				break
+			}
+			cv = next
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+
+		if l == 0 {
+			// Extend the 4-byte match as long as possible.
+			l = e.matchlenLong(s+4, t+4, src) + 4
+		} else if l == maxMatchLength {
+			l += e.matchlenLong(s+l, t+l, src)
+		}
+
+		// Try to locate a better match by checking the end of best match...
+		if sAt := s + l; l < 30 && sAt < sLimit {
+			// Allow some bytes at the beginning to mismatch.
+			// Sweet spot is 2-3 bytes, depending on input.
+			// 3 is only a little better when it helps, but sometimes a lot worse.
+			// The skipped bytes are tested in Extend backwards,
+			// and still picked up as part of the match if they match.
+			const skipBeginning = 2
+			eLong := e.bTable[hash7(load6432(src, sAt), tableBits)].Cur.offset
+			t2 := eLong - e.cur - l + skipBeginning
+			s2 := s + skipBeginning
+			off := s2 - t2
+			if t2 >= 0 && off < maxMatchOffset && off > 0 {
+				if l2 := e.matchlenLong(s2, t2, src); l2 > l {
+					t = t2
+					l = l2
+					s = s2
+				}
+			}
+		}
+
+		// Extend backwards
+		for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+			s--
+			t--
+			l++
+		}
+		if nextEmit < s {
+			if false {
+				emitLiteral(dst, src[nextEmit:s])
+			} else {
+				for _, v := range src[nextEmit:s] {
+					dst.tokens[dst.n] = token(v)
+					dst.litHist[v]++
+					dst.n++
+				}
+			}
+		}
+		if debugDeflate {
+			if t >= s {
+				panic(fmt.Sprintln("s-t", s, t))
+			}
+			if (s - t) > maxMatchOffset {
+				panic(fmt.Sprintln("mmo", s-t))
+			}
+			if l < baseMatchLength {
+				panic("bml")
+			}
+		}
+
+		dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+		s += l
+		nextEmit = s
+		if nextS >= s {
+			s = nextS + 1
+		}
+
+		if s >= sLimit {
+			goto emitRemainder
+		}
+
+		// Store every 3rd hash in-between.
+		if true {
+			const hashEvery = 3
+			i := s - l + 1
+			if i < s-1 {
+				cv := load6432(src, i)
+				t := tableEntry{offset: i + e.cur}
+				e.table[hashLen(cv, tableBits, hashShortBytes)] = t
+				eLong := &e.bTable[hash7(cv, tableBits)]
+				eLong.Cur, eLong.Prev = t, eLong.Cur
+
+				// Do a long at i+1
+				cv >>= 8
+				t = tableEntry{offset: t.offset + 1}
+				eLong = &e.bTable[hash7(cv, tableBits)]
+				eLong.Cur, eLong.Prev = t, eLong.Cur
+
+				// We only have enough bits for a short entry at i+2
+				cv >>= 8
+				t = tableEntry{offset: t.offset + 1}
+				e.table[hashLen(cv, tableBits, hashShortBytes)] = t
+
+				// Skip one - otherwise we risk hitting 's'
+				i += 4
+				for ; i < s-1; i += hashEvery {
+					cv := load6432(src, i)
+					t := tableEntry{offset: i + e.cur}
+					t2 := tableEntry{offset: t.offset + 1}
+					eLong := &e.bTable[hash7(cv, tableBits)]
+					eLong.Cur, eLong.Prev = t, eLong.Cur
+					e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2
+				}
+			}
+		}
+
+		// We could immediately start working at s now, but to improve
+		// compression we first update the hash table at s-1 and at s.
+		x := load6432(src, s-1)
+		o := e.cur + s - 1
+		prevHashS := hashLen(x, tableBits, hashShortBytes)
+		prevHashL := hash7(x, tableBits)
+		e.table[prevHashS] = tableEntry{offset: o}
+		eLong := &e.bTable[prevHashL]
+		eLong.Cur, eLong.Prev = tableEntry{offset: o}, eLong.Cur
+		cv = x >> 8
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		// If nothing was added, don't encode literals.
+		if dst.n == 0 {
+			return
+		}
+
+		emitLiteral(dst, src[nextEmit:])
+	}
+}
+
+// Reset the encoding table.
+func (e *fastEncL5Window) Reset() {
+	// We keep the same allocs, since we are compressing the same block sizes.
+	if cap(e.hist) < allocHistory {
+		e.hist = make([]byte, 0, allocHistory)
+	}
+
+	// We offset current position so everything will be out of reach.
+	// If we are above the buffer reset it will be cleared anyway since len(hist) == 0.
+	if e.cur <= int32(bufferReset) {
+		e.cur += e.maxOffset + int32(len(e.hist))
+	}
+	e.hist = e.hist[:0]
+}
+
+func (e *fastEncL5Window) addBlock(src []byte) int32 {
+	// Check if we have space already.
+	maxMatchOffset := e.maxOffset
+
+	if len(e.hist)+len(src) > cap(e.hist) {
+		if cap(e.hist) == 0 {
+			e.hist = make([]byte, 0, allocHistory)
+		} else {
+			if cap(e.hist) < int(maxMatchOffset*2) {
+				panic("unexpected buffer size")
+			}
+			// Move down
+			offset := int32(len(e.hist)) - maxMatchOffset
+			copy(e.hist[0:maxMatchOffset], e.hist[offset:])
+			e.cur += offset
+			e.hist = e.hist[:maxMatchOffset]
+		}
+	}
+	s := int32(len(e.hist))
+	e.hist = append(e.hist, src...)
+	return s
+}
+
+// matchlen will return the match length between offsets s and t in src.
+// The maximum length returned is maxMatchLength - 4.
+// It is assumed that s > t, that t >= 0 and that s < len(src).
+func (e *fastEncL5Window) matchlen(s, t int32, src []byte) int32 {
+	if debugDecode {
+		if t >= s {
+			panic(fmt.Sprint("t >= s:", t, s))
+		}
+		if int(s) >= len(src) {
+			panic(fmt.Sprint("s >= len(src):", s, len(src)))
+		}
+		if t < 0 {
+			panic(fmt.Sprint("t < 0:", t))
+		}
+		if s-t > e.maxOffset {
+			panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchOffset (", e.maxOffset, ")"))
+		}
+	}
+	s1 := int(s) + maxMatchLength - 4
+	if s1 > len(src) {
+		s1 = len(src)
+	}
+
+	// Extend the match to be as long as possible.
+	return int32(matchLen(src[s:s1], src[t:]))
+}
+
+// matchlenLong will return the match length between offsets s and t in src.
+// It is assumed that s > t, that t >= 0 and that s < len(src).
+func (e *fastEncL5Window) matchlenLong(s, t int32, src []byte) int32 {
+	if debugDeflate {
+		if t >= s {
+			panic(fmt.Sprint("t >= s:", t, s))
+		}
+		if int(s) >= len(src) {
+			panic(fmt.Sprint("s >= len(src):", s, len(src)))
+		}
+		if t < 0 {
+			panic(fmt.Sprint("t < 0:", t))
+		}
+		if s-t > e.maxOffset {
+			panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchOffset (", e.maxOffset, ")"))
+		}
+	}
+	// Extend the match to be as long as possible.
+	return int32(matchLen(src[s:], src[t:]))
+}
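Reviewer note: the window limit is enforced by addBlock above. Once accumulated history would overflow the buffer, the newest maxOffset bytes are copied down to the front and e.cur advances by the discarded amount, so offsets stored in the hash tables stay valid and no match can reach further back than maxOffset. A standalone sketch of that sliding-history bookkeeping, with hypothetical names (slidingHistory, add) and none of the encoder's hash tables:

```go
package main

import "fmt"

// slidingHistory keeps at most window bytes of look-back,
// mirroring fastEncL5Window.addBlock's "move down" step.
type slidingHistory struct {
	hist   []byte
	window int
	cur    int // absolute position of hist[0]
}

// add appends src and returns the index where it starts in hist.
func (h *slidingHistory) add(src []byte) (start int) {
	if len(h.hist)+len(src) > cap(h.hist) && len(h.hist) > h.window {
		// Keep only the newest window bytes; shift the absolute base.
		off := len(h.hist) - h.window
		copy(h.hist, h.hist[off:])
		h.hist = h.hist[:h.window]
		h.cur += off
	}
	start = len(h.hist)
	h.hist = append(h.hist, src...)
	return start
}

func main() {
	h := &slidingHistory{hist: make([]byte, 0, 8), window: 4}
	for _, chunk := range []string{"abcd", "efgh", "ijkl"} {
		s := h.add([]byte(chunk))
		fmt.Printf("base=%d start=%d hist=%q\n", h.cur, s, h.hist)
	}
}
```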
diff --git a/flate/reader_test.go b/flate/reader_test.go
index bc83c1f1d9..37e9b912fe 100644
--- a/flate/reader_test.go
+++ b/flate/reader_test.go
@@ -81,6 +81,7 @@ const (
 	speed    = BestSpeed
 	default_ = DefaultCompression
 	compress = BestCompression
+	oneK     = -1024
 )
 
 func BenchmarkDecodeDigitsSpeed1e4(b *testing.B) { benchmarkDecode(b, digits, speed, 1e4) }
diff --git a/flate/writer_test.go b/flate/writer_test.go
index 766ab6b900..22d006ce34 100644
--- a/flate/writer_test.go
+++ b/flate/writer_test.go
@@ -217,6 +217,10 @@ func BenchmarkEncodeTwainSL1e4(b *testing.B) { benchmarkStatelessEncoder(
 func BenchmarkEncodeTwainSL1e5(b *testing.B) { benchmarkStatelessEncoder(b, twain, 1e5) }
 func BenchmarkEncodeTwainSL1e6(b *testing.B) { benchmarkStatelessEncoder(b, twain, 1e6) }
 
+func BenchmarkEncodeTwain1024Win1e4(b *testing.B) { benchmarkEncoder(b, twain, oneK, 1e4) }
+func BenchmarkEncodeTwain1024Win1e5(b *testing.B) { benchmarkEncoder(b, twain, oneK, 1e5) }
+func BenchmarkEncodeTwain1024Win1e6(b *testing.B) { benchmarkEncoder(b, twain, oneK, 1e6) }
+
 func benchmarkStatelessEncoder(b *testing.B, testfile, n int) {
 	b.SetBytes(int64(n))
 	buf0, err := os.ReadFile(testfiles[testfile])
diff --git a/gzip/gzip.go b/gzip/gzip.go
index 26203851bd..5bc720593e 100644
--- a/gzip/gzip.go
+++ b/gzip/gzip.go
@@ -74,6 +74,27 @@ func NewWriterLevel(w io.Writer, level int) (*Writer, error) {
 	return z, nil
 }
 
+// MinCustomWindowSize is the minimum window size that can be sent to NewWriterWindow.
+const MinCustomWindowSize = flate.MinCustomWindowSize
+
+// MaxCustomWindowSize is the maximum custom window that can be sent to NewWriterWindow.
+const MaxCustomWindowSize = flate.MaxCustomWindowSize
+
+// NewWriterWindow returns a new Writer compressing data with a custom window size.
+// windowSize must be from MinCustomWindowSize to MaxCustomWindowSize.
+func NewWriterWindow(w io.Writer, windowSize int) (*Writer, error) {
+	if windowSize < MinCustomWindowSize {
+		return nil, errors.New("gzip: requested window size less than MinCustomWindowSize")
+	}
+	if windowSize > MaxCustomWindowSize {
+		return nil, errors.New("gzip: requested window size bigger than MaxCustomWindowSize")
+	}
+
+	z := new(Writer)
+	z.init(w, -windowSize)
+	return z, nil
+}
+
 func (z *Writer) init(w io.Writer, level int) {
 	compressor := z.compressor
 	if level != StatelessCompression {
diff --git a/gzip/gzip_test.go b/gzip/gzip_test.go
index e1ebb5a178..4c7992aa2b 100644
--- a/gzip/gzip_test.go
+++ b/gzip/gzip_test.go
@@ -7,6 +7,7 @@ package gzip
 import (
 	"bufio"
 	"bytes"
+	"fmt"
 	"io"
 	"math/rand"
 	"os"
@@ -252,7 +253,7 @@ func testFile(i, level int, t *testing.T) {
 
 	br := bytes.NewBuffer(testbuf)
 	var buf bytes.Buffer
-	w, err := NewWriterLevel(&buf, DefaultCompression)
+	w, err := NewWriterLevel(&buf, level)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -309,6 +310,56 @@ func TestFile200(t *testing.T) {
 	testFile(200, BestSpeed, t)
 }
 
+func TestFileWindow(t *testing.T) {
+	for sz := MinCustomWindowSize; sz <= MaxCustomWindowSize; sz *= 2 {
+		t.Run(fmt.Sprint(sz), func(t *testing.T) {
+			testFileWindow(1, sz, t)
+		})
+	}
+}
+
+func testFileWindow(i, window int, t *testing.T) {
+	dat, _ := os.ReadFile("testdata/test.json")
+	dl := len(dat)
+	if len(testbuf) != i*dl {
+		// Make results predictable
+		testbuf = make([]byte, i*dl)
+		for j := 0; j < i; j++ {
+			copy(testbuf[j*dl:j*dl+dl], dat)
+		}
+	}
+
+	br := bytes.NewBuffer(testbuf)
+	var buf bytes.Buffer
+	w, err := NewWriterWindow(&buf, window)
+	if err != nil {
+		t.Fatal(err)
+	}
+	n, err := io.Copy(w, br)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if int(n) != len(testbuf) {
+		t.Fatal("Short write:", n, "!=", len(testbuf))
+	}
+	err = w.Close()
+	if err != nil {
+		t.Fatal(err)
+	}
+	t.Logf("size: %d bytes", buf.Len())
+	r, err := NewReader(&buf)
+	if err != nil {
+		t.Fatal(err.Error())
+	}
+	decoded, err := io.ReadAll(r)
+	if err != nil {
+		t.Fatal(err.Error())
+	}
+	if !bytes.Equal(testbuf, decoded) {
+		t.Errorf("decoded content does not match")
+	}
+}
+
 func testBigGzip(i int, t *testing.T) {
 	if len(testbuf) != i {
 		// Make results predictable
@@ -385,7 +436,7 @@ func TestDeterministicL7(t *testing.T) { testDeterm(7, t) }
 func TestDeterministicL8(t *testing.T) { testDeterm(8, t) }
 func TestDeterministicL9(t *testing.T) { testDeterm(9, t) }
 
-func testDeterm(i int, t *testing.T) {
+func testDeterm(level int, t *testing.T) {
 	var length = 500000
 	if testing.Short() {
 		length = 100000
@@ -398,7 +449,7 @@ func testDeterm(i int, t *testing.T) {
 	br := bytes.NewBuffer(t1)
 	var b1 bytes.Buffer
-	w, err := NewWriterLevel(&b1, i)
+	w, err := NewWriterLevel(&b1, level)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -419,7 +470,7 @@ func testDeterm(i int, t *testing.T) {
 	br2 := bytes.NewBuffer(t2)
 	var b2 bytes.Buffer
-	w2, err := NewWriterLevel(&b2, i)
+	w2, err := NewWriterLevel(&b2, level)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -445,7 +496,7 @@ func testDeterm(i int, t *testing.T) {
 	b2b := b2.Bytes()
 
 	if !bytes.Equal(b1b, b2b) {
-		t.Fatalf("Level %d did not produce deterministric result, len(a) = %d, len(b) = %d", i, len(b1b), len(b2b))
+		t.Fatalf("Level %d did not produce deterministic result, len(a) = %d, len(b) = %d", level, len(b1b), len(b2b))
 	}
 }
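Reviewer note: to see the size/memory trade-off that TestFileWindow logs in the commit message, something like the following can be run against a patched github.com/klauspost/compress/gzip. It is a sketch: the input and the *8 window progression are arbitrary, and real inputs will show different curves.

```go
package main

import (
	"bytes"
	"fmt"
	"log"

	"github.com/klauspost/compress/gzip"
)

func main() {
	input := bytes.Repeat([]byte("some moderately repetitive input data. "), 4096)

	// Try a few window sizes between the allowed minimum and maximum.
	for window := gzip.MinCustomWindowSize; window <= gzip.MaxCustomWindowSize; window *= 8 {
		var buf bytes.Buffer
		w, err := gzip.NewWriterWindow(&buf, window)
		if err != nil {
			log.Fatal(err)
		}
		if _, err := w.Write(input); err != nil {
			log.Fatal(err)
		}
		if err := w.Close(); err != nil {
			log.Fatal(err)
		}
		// Decompression needs no special handling: the stream is standard gzip.
		fmt.Printf("window %5d: %d bytes\n", window, buf.Len())
	}
}
```

Larger windows let the encoder find matches further back, so the compressed size shrinks as the window grows, at the cost of keeping proportionally more history in memory on the compressing side.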