-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* feat: add trunk compress for storedFields * feat: use zstd replace snappy * feat: compress docValue * feat: reback the docValue compress * feat: packed docNum and Offset for docValue * doc: update go.mod * feat: packed numeric of posting list * feat: compress numeric of posting list * feat: packed numeric of posting list * feat: compress intcoder * feat: run optimize on bitmap * feat: optimize document values chunk * doc: add author * style: change implement * fix: tests * test: add test for document coder * style: format code * update trunk to chunk * update sort of Authors * rename BufferSize to Size and remove Close method * update version * fix panic when search memory * rename variables
- Loading branch information
1 parent
09719ef
commit 8b0e8c2
Showing
20 changed files
with
575 additions
and
182 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,4 +7,5 @@ | |
# | ||
# Please keep the list sorted. | ||
|
||
Hengfei Yang <hengfei.yang@gmail.com> | ||
Marty Schoch <marty.schoch@gmail.com> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
package ice | ||
|
||
import ( | ||
"bytes" | ||
"encoding/binary" | ||
"io" | ||
) | ||
|
||
const defaultDocumentChunkSize uint32 = 128 | ||
|
||
// chunkedDocumentCoder buffers documents (meta+data pairs) and writes
// them to w as zstd-compressed chunks of chunkSize documents each,
// recording the cumulative compressed byte offset at every chunk
// boundary so readers can seek to individual chunks.
type chunkedDocumentCoder struct {
	chunkSize  uint64        // number of documents per compressed chunk
	w          io.Writer     // destination for compressed chunks and the trailing offset table
	buf        *bytes.Buffer // uncompressed staging buffer for the current chunk
	metaBuf    []byte        // scratch buffer for uvarint encoding (MaxVarintLen64 bytes)
	n          uint64        // count of documents added so far
	bytes      uint64        // total compressed bytes written to w
	compressed []byte        // reusable destination buffer for zstd output
	offsets    []uint64      // cumulative compressed offsets of chunk boundaries; starts with a 0 sentinel
}
|
||
func newChunkedDocumentCoder(chunkSize uint64, w io.Writer) *chunkedDocumentCoder { | ||
c := &chunkedDocumentCoder{ | ||
chunkSize: chunkSize, | ||
w: w, | ||
} | ||
c.buf = bytes.NewBuffer(nil) | ||
c.metaBuf = make([]byte, binary.MaxVarintLen64) | ||
c.offsets = append(c.offsets, 0) | ||
return c | ||
} | ||
|
||
func (c *chunkedDocumentCoder) Add(docNum uint64, meta, data []byte) (int, error) { | ||
var wn, n int | ||
var err error | ||
n = binary.PutUvarint(c.metaBuf, uint64(len(meta))) | ||
if n, err = c.writeToBuf(c.metaBuf[:n]); err != nil { | ||
return 0, err | ||
} | ||
wn += n | ||
n = binary.PutUvarint(c.metaBuf, uint64(len(data))) | ||
if n, err = c.writeToBuf(c.metaBuf[:n]); err != nil { | ||
return 0, err | ||
} | ||
wn += n | ||
if n, err = c.writeToBuf(meta); err != nil { | ||
return 0, err | ||
} | ||
wn += n | ||
if n, err = c.writeToBuf(data); err != nil { | ||
return 0, err | ||
} | ||
wn += n | ||
|
||
return wn, c.newLine() | ||
} | ||
|
||
func (c *chunkedDocumentCoder) writeToBuf(data []byte) (int, error) { | ||
return c.buf.Write(data) | ||
} | ||
|
||
func (c *chunkedDocumentCoder) newLine() error { | ||
c.n++ | ||
if c.n%c.chunkSize != 0 { | ||
return nil | ||
} | ||
return c.flush() | ||
} | ||
|
||
// flush compresses the buffered chunk (if any), writes it to w, and
// records the new cumulative compressed offset. It runs on every
// chunk boundary and once more from Write; when the buffer is empty
// it still appends an offset, so consecutive equal offsets denote an
// empty chunk.
func (c *chunkedDocumentCoder) flush() error {
	if c.buf.Len() > 0 {
		var err error
		// Reuse c.compressed's full capacity as the destination buffer;
		// assumes ZSTDCompress returns the (re)sized result slice, so the
		// assignment keeps the grown buffer for reuse — TODO confirm against
		// the ZSTDCompress implementation.
		c.compressed, err = ZSTDCompress(c.compressed[:cap(c.compressed)], c.buf.Bytes(), ZSTDCompressionLevel)
		if err != nil {
			return err
		}
		n, err := c.w.Write(c.compressed)
		if err != nil {
			return err
		}
		c.bytes += uint64(n)
		c.buf.Reset()
	}
	// Record the boundary even for an empty chunk.
	c.offsets = append(c.offsets, c.bytes)
	return nil
}
|
||
func (c *chunkedDocumentCoder) Write() error { | ||
// flush first | ||
if err := c.flush(); err != nil { | ||
return err | ||
} | ||
var err error | ||
var wn, n int | ||
// write chunk offsets | ||
for _, offset := range c.offsets { | ||
n = binary.PutUvarint(c.metaBuf, offset) | ||
if _, err = c.w.Write(c.metaBuf[:n]); err != nil { | ||
return err | ||
} | ||
wn += n | ||
} | ||
// write chunk offset length | ||
err = binary.Write(c.w, binary.BigEndian, uint32(wn)) | ||
if err != nil { | ||
return err | ||
} | ||
// write chunk num | ||
err = binary.Write(c.w, binary.BigEndian, uint32(len(c.offsets))) | ||
if err != nil { | ||
return err | ||
} | ||
return nil | ||
} | ||
|
||
func (c *chunkedDocumentCoder) Reset() { | ||
c.compressed = c.compressed[:0] | ||
c.offsets = c.offsets[:0] | ||
c.n = 0 | ||
c.bytes = 0 | ||
c.buf.Reset() | ||
} | ||
|
||
// Size returns the number of uncompressed bytes currently buffered in
// the in-progress (unflushed) chunk.
func (c *chunkedDocumentCoder) Size() uint64 {
	return uint64(c.buf.Len())
}
|
||
// Len returns the number of chunk boundary offsets recorded so far.
// Because offsets includes the leading 0 sentinel (and, after Write,
// the final boundary), this is one more than the number of flushed
// chunks — e.g. one chunk yields 3 ("left, chunk, right" per the tests).
func (c *chunkedDocumentCoder) Len() int {
	return len(c.offsets)
}
|
||
// Len returns chunks num | ||
func (c *chunkedDocumentCoder) Offsets() []uint64 { | ||
m := make([]uint64, 0, len(c.offsets)) | ||
m = append(m, c.offsets...) | ||
return m | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
package ice | ||
|
||
import ( | ||
"bytes" | ||
"testing" | ||
) | ||
|
||
func TestChunkedDocumentCoder(t *testing.T) { | ||
tests := []struct { | ||
chunkSize uint64 | ||
docNums []uint64 | ||
metas [][]byte | ||
datas [][]byte | ||
expected []byte | ||
expectedChunkNum int | ||
}{ | ||
{ | ||
chunkSize: 1, | ||
docNums: []uint64{0}, | ||
metas: [][]byte{{0}}, | ||
datas: [][]byte{[]byte("bluge")}, | ||
expected: []byte{ | ||
0x28, 0xb5, 0x2f, 0xfd, 0x4, 0x0, 0x41, | ||
0x0, 0x0, 0x1, 0x5, 0x0, 0x62, 0x6c, 0x75, 0x67, 0x65, 0x2b, 0x30, 0x97, 0x33, 0x0, 0x15, 0x15, | ||
0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x3, | ||
}, | ||
expectedChunkNum: 3, // left, chunk, right | ||
}, | ||
{ | ||
chunkSize: 1, | ||
docNums: []uint64{0, 1}, | ||
metas: [][]byte{{0}, {1}}, | ||
datas: [][]byte{[]byte("upside"), []byte("scorch")}, | ||
expected: []byte{ | ||
0x28, 0xb5, 0x2f, 0xfd, 0x4, 0x0, 0x49, | ||
0x0, 0x0, 0x1, 0x6, 0x0, 0x75, 0x70, 0x73, 0x69, 0x64, 0x65, | ||
0x36, 0x6e, 0x7e, 0x39, 0x28, 0xb5, 0x2f, 0xfd, 0x4, 0x0, 0x49, | ||
0x0, 0x0, 0x1, 0x6, 0x1, 0x73, 0x63, 0x6f, 0x72, 0x63, 0x68, | ||
0x8f, 0x83, 0xa3, 0x37, 0x0, 0x16, 0x2c, 0x2c, | ||
0x0, 0x0, 0x0, 0x4, 0x0, 0x0, 0x0, 0x4, | ||
}, | ||
expectedChunkNum: 4, // left, chunk, chunk, right | ||
}, | ||
} | ||
|
||
for _, test := range tests { | ||
var actual bytes.Buffer | ||
cic := newChunkedDocumentCoder(test.chunkSize, &actual) | ||
for i, docNum := range test.docNums { | ||
_, err := cic.Add(docNum, test.metas[i], test.datas[i]) | ||
if err != nil { | ||
t.Fatalf("error adding to documentcoder: %v", err) | ||
} | ||
} | ||
err := cic.Write() | ||
if err != nil { | ||
t.Fatalf("error writing: %v", err) | ||
} | ||
if !bytes.Equal(test.expected, actual.Bytes()) { | ||
t.Errorf("got:%s, expected:%s", actual.String(), string(test.expected)) | ||
} | ||
if test.expectedChunkNum != cic.Len() { | ||
t.Errorf("got:%d, expected:%d", cic.Len(), test.expectedChunkNum) | ||
} | ||
} | ||
} | ||
|
||
func TestChunkedDocumentCoders(t *testing.T) { | ||
chunkSize := uint64(2) | ||
docNums := []uint64{0, 1, 2, 3, 4, 5} | ||
metas := [][]byte{ | ||
{0}, | ||
{1}, | ||
{2}, | ||
{3}, | ||
{4}, | ||
{5}, | ||
} | ||
datas := [][]byte{ | ||
[]byte("scorch"), | ||
[]byte("does"), | ||
[]byte("better"), | ||
[]byte("than"), | ||
[]byte("upside"), | ||
[]byte("down"), | ||
} | ||
chunkNum := 5 // left, chunk, chunk, chunk, right | ||
|
||
var actual1, actual2 bytes.Buffer | ||
// chunkedDocumentCoder that writes out at the end | ||
cic1 := newChunkedDocumentCoder(chunkSize, &actual1) | ||
// chunkedContentCoder that writes out in chunks | ||
cic2 := newChunkedDocumentCoder(chunkSize, &actual2) | ||
|
||
for i, docNum := range docNums { | ||
_, err := cic1.Add(docNum, metas[i], datas[i]) | ||
if err != nil { | ||
t.Fatalf("error adding to documentcoder: %v", err) | ||
} | ||
_, err = cic2.Add(docNum, metas[i], datas[i]) | ||
if err != nil { | ||
t.Fatalf("error adding to documentcoder: %v", err) | ||
} | ||
} | ||
|
||
err := cic1.Write() | ||
if err != nil { | ||
t.Fatalf("error writing: %v", err) | ||
} | ||
err = cic2.Write() | ||
if err != nil { | ||
t.Fatalf("error writing: %v", err) | ||
} | ||
|
||
if !bytes.Equal(actual1.Bytes(), actual2.Bytes()) { | ||
t.Errorf("%s != %s", actual1.String(), actual2.String()) | ||
} | ||
if chunkNum != cic1.Len() { | ||
t.Errorf("got:%d, expected:%d", cic1.Len(), chunkNum) | ||
} | ||
if chunkNum != cic2.Len() { | ||
t.Errorf("got:%d, expected:%d", cic2.Len(), chunkNum) | ||
} | ||
} |
Oops, something went wrong.